
Overview Class
function 1
1. Extract the CSV from the zip file named 'archive' in the 'raw' folder into the 'processed' folder and rename it to 'rawdata.csv'.
function 2
2. Show a data summary of rawdata.csv: info, percentage of missing data in each column, head and tail for all columns, number of unique values in each column, and summary statistics for the numerical columns.

Note that I'll be executing this class from the notebook overview.ipynb in the notebooks/001-overview folder.
Ensure that the output, even though I'm using an .ipynb file, is structured in an easy-to-read way, such as Markdown-style tables.

In [4]:
import io
import os
import zipfile

import pandas as pd

class Overview:
    """Extract the raw dataset archive and print a Markdown-style summary.

    All paths are derived from ``parent_folder``: the archive is read from
    ``<parent_folder>/data/raw/archive.zip`` and the dataset is written to
    ``<parent_folder>/data/processed/rawdata.csv``.
    """

    def __init__(self, parent_folder):
        """Store the project root and derive the raw/processed data folders.

        Args:
            parent_folder: Path of the project root that contains ``data/``.
        """
        self.parent_folder = parent_folder
        self.raw_folder = os.path.join(self.parent_folder, 'data', 'raw')
        self.processed_folder = os.path.join(self.parent_folder, 'data', 'processed')

    def extract_and_rename(self):
        """Extract ``archive.zip`` and store its first CSV as ``rawdata.csv``.

        Every archive member is extracted into the processed folder. The
        first CSV member (in archive order) is then moved to ``rawdata.csv``,
        overwriting any previous copy. If the archive holds no CSV, a message
        is printed and nothing is renamed.

        Raises:
            FileNotFoundError: If ``archive.zip`` is missing from the raw folder.
            zipfile.BadZipFile: If the archive is not a valid zip file.
        """
        zip_path = os.path.join(self.raw_folder, 'archive.zip')
        rawdata_path = os.path.join(self.processed_folder, 'rawdata.csv')
        os.makedirs(self.processed_folder, exist_ok=True)

        # Track the paths that actually came out of the archive. Listing the
        # processed folder instead could pick up a stale CSV (or a previous
        # rawdata.csv) that was never part of this archive.
        csv_paths = []
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for member in zip_ref.infolist():
                extracted_path = zip_ref.extract(member, self.processed_folder)
                if not member.is_dir() and extracted_path.lower().endswith('.csv'):
                    csv_paths.append(extracted_path)

        if not csv_paths:
            print("No CSV file found in the archive.")
            return

        source_path = csv_paths[0]
        same_file = (
            os.path.normcase(os.path.abspath(source_path))
            == os.path.normcase(os.path.abspath(rawdata_path))
        )
        if not same_file:
            # os.replace overwrites an existing target on every platform, so
            # no separate remove step is needed.
            os.replace(source_path, rawdata_path)

    def show_data_summary(self):
        """Print a sectioned summary of ``rawdata.csv``.

        Sections, in order: Info, Missing Data (%), Head, Tail, Unique
        Features and Summary Statistics. DataFrame/Series sections are
        rendered with ``to_markdown()`` (which relies on pandas' optional
        ``tabulate`` dependency); the Info section is printed as plain text.
        If ``rawdata.csv`` does not exist, a message is printed instead.
        """
        csv_path = os.path.join(self.processed_folder, 'rawdata.csv')
        if not os.path.exists(csv_path):
            print("The file 'rawdata.csv' does not exist in the processed folder.")
            return

        try:
            df = pd.read_csv(csv_path, encoding='utf-16')
        except UnicodeError:
            df = pd.read_csv(csv_path, encoding='utf-8')

        # DataFrame.info() writes to stdout and returns None, so capture its
        # report in a buffer; otherwise it is emitted before any header and
        # the "Info" section itself would just show "None".
        info_buffer = io.StringIO()
        df.info(buf=info_buffer)

        summary = {
            "Info": info_buffer.getvalue(),
            "Missing Data (%)": df.isnull().mean() * 100,
            "Head": df.head(),
            "Tail": df.tail(),
            "Unique Features": df.nunique(),
            "Summary Statistics": df.describe(),
        }

        # Display the summary in a structured format
        for key, value in summary.items():
            print(f"## {key}\n")
            if isinstance(value, (pd.DataFrame, pd.Series)):
                print(value.to_markdown())
            else:
                print(value)
            print("\n")




In [5]:
# Step 1: Define the project root. Overview derives data/raw and
# data/processed from this path.
# NOTE(review): absolute path specific to one Windows machine — adjust when
# running elsewhere.
parent_folder = r'T:\Github Projects\data-science-practice\Most-Streamed-Spotify-Songs-2024'

# Step 2: Create an instance of the Overview class rooted at that folder
overview = Overview(parent_folder)

# Step 3: Extract data/raw/archive.zip and store its CSV as
# data/processed/rawdata.csv
overview.extract_and_rename()



In [6]:

# Step 4: Print the summary sections (info, missing-data percentages, head,
# tail, unique counts, summary statistics) for data/processed/rawdata.csv
overview.show_data_summary()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4595 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4487 non-null   object 
 8   Spotify Playlist Count      4530 non-null   object 
 9   Spotify Playlist Reach      4528 non-null   object 
 10  Spotify Popularity          3796 non-null   float64
 11  YouTube Views               4292 non-null   object 
 12  YouTube Likes               4285 non-null   object 
 13  TikTok Posts                3427 