<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/cleaning_scripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning scripts

## Importing libraries

In [None]:
# Upgrade Matplotlib
!pip install matplotlib --upgrade
#!pip install plotly --upgrade

In [None]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact

## Loading Data

In [None]:
# Loading data from local drive
# (`uploaded1` is indexed by file name further down to obtain the raw bytes of the uploaded file)
from google.colab import files
uploaded1 = files.upload()

In [None]:
# Storing loaded data from excel to a pandas dataframe
import io

# Preferring the expected file name, but falling back to the single uploaded file:
# the upload may be stored under a different name (e.g. a repeated upload of the same file),
# in which case a hard-coded key would raise a bare KeyError.
expected_name = 'extract.xlsx'
if expected_name in uploaded1:
  upload_name = expected_name
elif len(uploaded1) == 1:
  upload_name = next(iter(uploaded1))
else:
  raise KeyError(f"'{expected_name}' not found among uploaded files: {list(uploaded1)}")

df1 = pd.read_excel(io.BytesIO(uploaded1[upload_name]))

In [None]:
# Storing loaded data from csv to a pandas dataframe
#import io
#df1 = pd.read_csv(io.BytesIO(uploaded1['Script_202209261147.csv']), sep='|', engine='python')

In [None]:
# Changing column names to lower case
# (reassigning the result instead of using inplace=True keeps the step explicit)
df1 = df1.rename(columns=str.lower)

In [None]:
# Summarising each column's dtype and non-null count as a dataframe
# (df1.info() only prints and returns None, so wrapping it in pd.DataFrame displayed an empty frame)
pd.DataFrame({'dtype': df1.dtypes, 'non_null': df1.count()})

In [None]:
# Listing the dtype of each column
df1.dtypes

In [None]:
# Truncating county names to their first 16 characters
# NOTE(review): the earlier comment mentioned "nace main", but the code slices 'county' - confirm which column was intended
df1 = df1.assign(county=lambda frame: frame['county'].str.slice(stop=16))

In [None]:
# Collecting the names of the categorical (object) and numeric (float64) columns
custom_list_1 = list(df1.select_dtypes(include='object').columns)
custom_list_2 = list(df1.select_dtypes(include='float64').columns)

In [None]:
# Previewing the last row of the dataframe
# (presumably used as a quick size check via the last index label - confirm)
df1.iloc[-1:]

In [None]:
# Checking the dataframe info
# (info() prints its summary directly, so there is no return value worth capturing)
df1.info()

In [None]:
# Abbreviating the first nace_main value: first two characters of each space-separated word, joined by '-'
'-'.join(word[:2] for word in df1['nace_main'][0].split(' '))

## Exploring Data

In [None]:
# Exploring list of categorical columns with value counts
@interact(Column_name=custom_list_1, Percentage=[True, False])
def explore_value_counts(Column_name, Percentage):
  """Count how often each value occurs in the selected column of df1.

  Column_name: column picked from the categorical-column dropdown.
  Percentage: when True, counts are normalised to relative frequencies.
  Missing values are kept as their own category (dropna=False).
  """
  selected = df1[[Column_name]]
  return selected.value_counts(normalize=Percentage, dropna=False)

In [None]:
# Interacting with list of columns
@interact(Nace = df1['nace_main'].unique().tolist(), Variable = custom_list_2)
def visualize_nace(Nace, Variable):
  """Draw a bar chart of the per-county mean of one column for a single nace_main value.

  Nace: value of df1['nace_main'] to filter on.
  Variable: column of df1 whose mean is computed per county.
  """
  nace_rows = df1.loc[df1['nace_main'] == Nace]
  county_means = (
      nace_rows
      .groupby(by=['nace_main', 'county'])
      .agg(avg=(Variable, 'mean'))
      .reset_index()
  )

  _fig, axes = plt.subplots(figsize=(7.5, 5.5))
  axes.set_title(f'{Nace}: {Variable}')
  sns.barplot(x='nace_main', y='avg', data=county_means, hue='county', palette='tab20', ax=axes)
  # Anchoring the legend just outside the right edge of the axes
  axes.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
  plt.show()

## Aggregating Data

In [None]:
# Selecting columns for aggregation
# => Replace the four empty placeholders with the column names to group by.
custom_list_2 = [''] * 4

In [None]:
# Main data aggregation: counting customer ids per combination of the grouping columns
# Note: column names are lower-cased right after loading, so the id column must be referenced
# in lower case ('CUSTOMER_ID' can no longer exist at this point).
# NOTE(review): presumably the id column is called 'customer_id' in the uploaded file - confirm.
# Note: dropna=False avoids dropping data if group keys contain NA values.
df1_a = (
    df1
    .groupby(by=custom_list_2, dropna=False)
    .agg(CHURN_COUNT=('customer_id', 'count'))
    .reset_index()
)

In [None]:
# Creating list of aggregated fields
# (one value-count table per categorical column, missing values included)
df1_list = [df1[column].value_counts(dropna=False).to_frame() for column in custom_list_1]

## Exporting results to local drive

In [None]:
# Exporting main excel file
# (written in one call, then handed to the browser for download)
df1_a.to_excel('Script_20221108.xlsx', sheet_name='Bedriftsliste', index=False, engine='openpyxl')
files.download('Script_20221108.xlsx')

In [None]:
# Exporting list of excel sheets (one sheet per value-count table)
with pd.ExcelWriter('Script_202208221130.xlsx', engine='openpyxl') as writer:
  # Iterating over every table: range(len(df1_list)-1) silently skipped the last one
  for counts in df1_list:
    # pandas >= 2.0 names the value_counts column 'count' and puts the source column name
    # on the index; older versions name the column after the source column. Preferring the
    # index name keeps sheet names unique on both.
    sheet = counts.index.name or counts.columns[0]
    counts.to_excel(writer, sheet_name=f'{sheet}', index=True)
files.download('Script_202208221130.xlsx')