Preparing data:

In [1]:
import pandas as pd
import numpy as np
import json

# Paths: the CSV to describe and the JSON report written by this cell.
file_csv = "Data/proj1_ex01.csv"
file_json = "proj1_ex01_fields.json"

df = pd.read_csv(file_csv)

# Column labels exactly as read from the CSV.
columns_names = df.head(0).columns.values

# One record per column: its name, the fraction of missing values, and a
# coarse dtype label ("int", "float" or "other").
columns_array = []
for column in columns_names:
    data = df[column]

    # pandas' dtype predicates are used instead of np.issubdtype because the
    # latter raises TypeError for pandas extension dtypes (e.g. nullable
    # "Int64" or "string"), which NumPy cannot interpret as a dtype.
    if pd.api.types.is_integer_dtype(data.dtype):
        dtype_label = "int"
    elif pd.api.types.is_float_dtype(data.dtype):
        dtype_label = "float"
    else:
        dtype_label = "other"

    columns_array.append({
        "name": data.name,
        # Converted to a built-in float so JSON serialization does not
        # depend on the NumPy scalar type returned by mean().
        "missing": float(data.isnull().mean()),
        "dtype": dtype_label,
    })

with open(file_json, "w") as file:
    json.dump(columns_array, file, indent=4)

Value statistics:

In [2]:
def string_statistics(data: pd.Series):
    """Summarize a non-numeric column.

    Returns a dict with:
      - "count":  number of non-null entries (as a float),
      - "unique": number of distinct non-null values,
      - "top":    the most frequent value, or None when there are no values,
      - "freq":   how many times "top" occurs, or None when there are no values.
    """
    frequencies = data.value_counts()

    # value_counts() drops nulls, so it is empty for an empty or all-null column.
    if frequencies.empty:
        most_common = None
        occurrences = None
    else:
        most_common = frequencies.idxmax()
        occurrences = int(frequencies.max())

    summary = {}
    summary["count"] = float(data.count())
    summary["unique"] = int(data.nunique())
    summary["top"] = most_common
    summary["freq"] = occurrences
    return summary

def numeric_statistics(data: pd.Series):
    """Summarize a numeric column.

    Returns a dict with the keys "count", "mean", "std", "min", "25%", "50%",
    "75%" and "max" (in that order); every value is converted to a built-in
    float.
    """
    # (label, zero-argument callable) pairs, listed in output order.
    reducers = (
        ("count", data.count),
        ("mean", data.mean),
        ("std", data.std),
        ("min", data.min),
        ("25%", lambda: data.quantile(0.25)),
        ("50%", data.median),
        ("75%", lambda: data.quantile(0.75)),
        ("max", data.max),
    )

    summary = {}
    for label, compute in reducers:
        summary[label] = float(compute())
    return summary

file_ex2 = "proj1_ex02_stats.json"

# Map every column name to its summary: numeric columns get the numeric
# summary, everything else is summarized as strings.
ex2_results = {
    name: (
        numeric_statistics(df[name])
        if pd.api.types.is_numeric_dtype(df[name])
        else string_statistics(df[name])
    )
    for name in df.columns
}

with open(file_ex2, "w") as stats_file:
    json.dump(ex2_results, stats_file, indent=4)

Normalizing column names:

In [3]:
import re

# Normalize every column label: spaces become underscores, letters are
# lower-cased, and any character other than ASCII letters, digits and "_"
# is dropped.
normalized_names = {
    column: re.sub(r'[^a-zA-Z0-9_]', '', column.replace(' ', '_').lower())
    for column in df.columns
}

# A single rename over the frame's current labels (rebinding `df` instead of
# mutating it in place) keeps the cell safe to re-run: labels that are
# already normalized simply map to themselves.
df = df.rename(columns=normalized_names)

file_ex3 = "proj1_ex03_columns.csv"

# to_csv opens and closes the target path itself, so no surrounding open()
# is needed.
df.to_csv(file_ex3, index=False)

Output formats:

In [4]:
# Make sure openpyxl is importable; install it on the fly when it is missing.
try:
    import openpyxl
except ImportError:
    import importlib
    import subprocess
    import sys

    # Run pip through the interpreter that executes this kernel so the
    # package lands in the same environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openpyxl"])

    # A package installed while the process is running can be missed by the
    # import system's cached directory listings; refresh them so a later
    # import works without restarting the kernel.
    importlib.invalidate_caches()

In [5]:
# Export `df` in three formats; each output path is declared right next to
# the call that writes it.

# Excel workbook, without the index column.
file_xlsx_ex4 = "proj1_ex04_excel.xlsx"
df.to_excel(file_xlsx_ex4, index=False)

# JSON as a list of row records, pretty-printed.
file_json_ex4 = "proj1_ex04_json.json"
df.to_json(file_json_ex4, orient="records", indent=4)

# Pickle of the DataFrame object.
file_pkl_ex4 = "proj1_ex04_pickle.pkl"
df.to_pickle(file_pkl_ex4)

Selecting rows and columns:

In [6]:
# Pickled DataFrame used as the input for the row/column selection below.
file_pkl_ex5 = "Data/proj1_ex05.pkl"

# NOTE(review): unpickling can execute arbitrary code, so this is only safe
# for files from a trusted source — confirm the provenance of this file.
df_ex5 = pd.read_pickle(file_pkl_ex5)

In [7]:
# Plain-text dump of the whole frame to inspect its index labels and columns.
print(df_ex5)

                    name          description   age
v                      V      Freedom fighter   NaN
evey        Evey Hammond        Revolutionary  16.0
finch         Eric Finch     Police detective  40.0
creedy      Peter Creedy  Government official  49.0
gordon   Gordon Deitrich       Talk show host  38.0
valerie     Valerie Page              Actress   NaN
delia     Delia Surridge   Medical researcher  50.0


In [8]:
# Make sure tabulate is importable; install it on the fly when it is missing.
try:
    import tabulate
except ImportError:
    import importlib
    import subprocess
    import sys

    # Run pip through the interpreter that executes this kernel so the
    # package lands in the same environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tabulate"])

    # A package installed while the process is running can be missed by the
    # import system's cached directory listings; refresh them so a later
    # import works without restarting the kernel.
    importlib.invalidate_caches()

In [9]:
# Keep the 2nd and 3rd columns, and only the rows whose index label starts
# with "v".
df_selected = df_ex5.iloc[:, 1:3]
df_selected = df_selected[df_selected.index.astype(str).str.startswith('v')]

rendered_table = df_selected.to_markdown(index=True)

# Blank out missing values, which are rendered as the text "nan". Only cells
# whose entire content is "nan" are touched: a replace over the whole table
# would also mangle ordinary text that merely contains those letters
# (e.g. "Financial" -> "Ficial"). Splitting and re-joining on "|" leaves
# every other character of the table untouched.
markdown_table = "\n".join(
    "|".join(
        cell.replace("nan", "") if cell.strip() == "nan" else cell
        for cell in line.split("|")
    )
    for line in rendered_table.split("\n")
)

file_md_ex5 = "proj1_ex05_table.md"

with open(file_md_ex5, "w") as file:
    file.write(markdown_table)

Flattening data:

In [10]:
# Output pickle path (written once the data has been flattened) and the
# nested JSON input.
file_pkl_ex6 = "proj1_ex06_pickle.pkl"
file_json_ex6 = "Data/proj1_ex06.json"

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's locale default, which differs between systems and would
# garble non-ASCII text where it is not UTF-8.
with open(file_json_ex6, 'r', encoding="utf-8") as file:
    json_ex6 = json.load(file)

In [11]:
# Flatten the nested JSON into a flat table; nested keys are joined with "."
# to form the column names.
df_ex6 = pd.json_normalize(json_ex6, sep='.')

In [None]:
# Persist the flattened frame as a pickle file.
df_ex6.to_pickle(file_pkl_ex6)