<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/cleaning_scripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning scripts

## Importing libraries

In [None]:
# Upgrade Matplotlib to the newest release pip can find.
# NOTE(review): no version is pinned, so the installed release can change between runs.
!pip install matplotlib --upgrade
#!pip install plotly --upgrade

In [None]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from ipywidgets import interact

## Loading Data

In [None]:
# Loading data from local drive
# The result is later indexed by file name and wrapped in io.BytesIO,
# i.e. it is used as a mapping of uploaded file name -> raw bytes.
from google.colab import files
uploaded1 = files.upload()

In [None]:
# Storing loaded data from excel to a pandas dataframe
#import io
#df1 = pd.read_excel(io.BytesIO(uploaded1['2023_04_12_data.xlsx']))

In [None]:
# Storing loaded data from csv to a pandas dataframe
# The upload is looked up under the literal key 'mdrift.csv' and parsed as a
# semicolon-separated file with the python parser engine.
# NOTE(review): a file uploaded under any other name will not be found here — confirm the expected file name.
import io
df1 = pd.read_csv(io.BytesIO(uploaded1['mdrift.csv']), sep=';', engine='python')#, encoding='latin-1')
# Optional read_csv arguments kept for reference: quotechar='"', on_bad_lines=False

In [None]:
# Normalising every column label to lower case (in place on df1)
df1.rename(columns=str.lower, inplace=True)

In [None]:
# Shorten the operator column label from 'operator_in_commercial_name' to 'operator_in' (in place on df1).
df1.rename(columns={'operator_in_commercial_name':'operator_in'}, inplace=True)

In [None]:
# Checking dataframe info
# DataFrame.info() prints its summary and returns None, so it is called on its
# own; wrapping it in pd.DataFrame(...) only appended an empty frame to the output.
df1.info()

## Cleaning Data

In [None]:
# Show the dtype of every column in df1.
# NOTE(review): earlier drafts tried to show dtypes next to non-null counts with
# pd.concat; that needs a list of objects, e.g.
# pd.concat([df1.dtypes, df1.count()], axis=1).
df1.dtypes

In [None]:
# Parsing object column to datetime
#df1['inserted_date'] = pd.to_datetime(df1['inserted_date']) #format='m%/%d/%Y %I:%M:%S %p')

date_col = 'timestamp' #-> Change to the desired column
def parse_datetime(date_col, df=None, fmt='%Y-%m-%d %H:%M:%S.%f'):
  """Convert a text column to datetime, overwriting it on the target frame.

  Args:
    date_col: Name of the column to convert.
    df: Frame to modify in place. When omitted, the notebook-level ``df1`` is
      used, so the existing call ``parse_datetime(date_col)`` keeps working.
    fmt: strptime-style pattern handed to ``pd.to_datetime``.

  Returns:
    None. Nothing is returned on purpose, so calling this as the last line of
    a cell does not dump the whole frame as cell output.

  Raises:
    KeyError: if ``date_col`` is not a column of the target frame.
    ValueError: if a value cannot be parsed with ``fmt``.
  """
  target = df1 if df is None else df
  target[date_col] = pd.to_datetime(target[date_col], format=fmt)

# Overwrites df1[date_col] with parsed datetimes; the call itself returns nothing.
parse_datetime(date_col)

In [None]:
# Extracting year, month, day of the week, hour and date categories

date_col = 'timestamp' #-> Change to the desired column
def extract_datetime(df, date_col):
  """Return a new frame with calendar features derived from ``date_col``.

  Added columns, in order: year, month (month name), weeknum (ISO week
  number), week_of_year ('w' + ISO week), day_of_week (day name),
  hour (hour followed by 'h') and hour_of_day (hour as a number).
  The input frame is left untouched.
  """
  stamps = df[date_col].dt
  iso_week = stamps.isocalendar().week
  hour_number = stamps.hour
  derived = {
      'year': stamps.year,
      'month': stamps.month_name(),
      'weeknum': iso_week,
      'week_of_year': 'w' + iso_week.astype(str),
      'day_of_week': stamps.day_name(),
      'hour': hour_number.astype(str) + 'h',
      'hour_of_day': hour_number,
  }
  return df.assign(**derived)

# Rebind df1 to the frame returned by extract_datetime, which carries the added calendar columns.
df1 = extract_datetime(df1, date_col)

In [None]:
# Cleaning invalid orgnum

org_col = 'customer_orgnumber' #-> Change to the desired column
def check_valid_org_num(df, org_col=org_col):
  """Flag every row as having a 'Valid' or 'Wrong' organisation number.

  A value is 'Valid' only when it consists of exactly nine digits. Missing
  values, longer or shorter strings, and strings containing non-digits are
  'Wrong'.

  Args:
    df: Frame holding the organisation-number column (text values).
    org_col: Column to check. Defaults to the ``org_col`` setting defined just
      above this function, so ``check_valid_org_num(df)`` keeps working.

  Returns:
    A new frame with an added ``org_check`` column; ``df`` is not modified.

  Raises:
    KeyError: if ``org_col`` is not a column of ``df``.
  """
  # The column is looked up through the org_col variable (not the literal
  # string 'org_col'). fullmatch anchors both ends of the pattern, which also
  # rejects values longer than nine characters; na=False sends missing values
  # to 'Wrong'.
  is_valid = df[org_col].str.fullmatch(r'\d{9}', na=False)
  return df.assign(org_check = np.where(is_valid, 'Valid', 'Wrong'))

# Rebind df1 to the frame returned by check_valid_org_num (adds the org_check column).
df1 = check_valid_org_num(df1)

In [None]:
# Transforming nace code: keep only the first 22 characters of the description
df1 = df1.assign(market_nace_main_desc=df1['market_nace_main_desc'].str.slice(stop=22))

In [None]:
# Slicing county: keep only the first 16 characters of the name
df1 = df1.assign(county=df1['county'].str.slice(stop=16))

In [None]:
# Assigning brand preference: 'Apple' when the product name mentions iPhone, otherwise 'Samsung'.
# na=False makes a missing product name count as "no match". Without it,
# str.contains yields NaN for missing names, numpy treats NaN as true, and the
# row is labelled 'Apple'.
df1 = df1.assign(brand = np.where(df1['product_name'].str.contains('iPhone', na=False),'Apple','Samsung'))

In [None]:
# Creating categorical orgnum column: the organisation number rendered as text behind an 'ORG' prefix
# NOTE(review): reads 'organization_number', while the validity check uses 'customer_orgnumber' — confirm both columns exist.
org_number_text = df1['organization_number'].astype(str)
df1 = df1.assign(**{'org_num': 'ORG' + org_number_text})

In [None]:
# Creating time of day column: hr <= 11 -> 'Morning', else hr <= 14 -> 'Lunch', else 'Afternoon'
# NOTE(review): reads an 'hr' column; extract_datetime names its numeric hour 'hour_of_day' — confirm 'hr' exists in the data.
df1 = df1.assign(time_of_day = np.select([df1['hr'] <= 11, df1['hr'] <= 14], ['Morning', 'Lunch'], default='Afternoon'))

In [None]:
# Customizing terminal categories
# The first label (in list order) found in product_name wins; anything else,
# including a missing name, falls back to 'Samsung'. na=False is required:
# str.contains yields NaN for missing names and numpy treats NaN as true, which
# would otherwise label those rows 'iPhone 11'.
product_labels = ['iPhone 11', 'iPhone 12', 'iPhone 13', 'iPhone 14', 'OnePlus']
df1 = df1.assign(product_choice = np.select(
    [df1['product_name'].str.contains(label, na=False) for label in product_labels],
    product_labels,
    default='Samsung'))

In [None]:
# Checking the dataframe tail: display only the last row of df1
df1.tail(1)

In [None]:
# Selecting categorical and numeric columns for EDA
# custom_list_1: labels of object-dtype columns
# custom_list_2: labels of float64/int64 columns whose name does not contain '_id'
custom_list_1 = df1.select_dtypes(include=['object']).columns.tolist()
custom_list_2 = [
    name
    for name in df1.select_dtypes(include=['float64','int64']).columns.tolist()
    if '_id' not in name
]

In [None]:
# Preview: float64/int64 column labels whose name does not contain '_id'
[name for name in df1.select_dtypes(include=['float64','int64']).columns.tolist() if '_id' not in name]

In [None]:
# Abbreviate the 'nace_main' value at index label 0: first two characters of each space-separated word, joined by '-'
"-".join(word[:2] for word in df1['nace_main'][0].split(' '))

In [None]:
# Rows flagged 'Valid'; reset_index() keeps the previous row labels as an 'index' column
df1.loc[df1['org_check'] == 'Valid', ['inserted_date','customer_orgnumber','org_check']].reset_index()

In [None]:
# Temp1: Slicing dataframe
# Keeps the listed columns for rows whose org_check is 'Valid', renumbered from 0.
df2 = df1.loc[
    df1['org_check'] == 'Valid',
    [
        'orderid', 'statustext', 'username', 'listname', 'inserted_date',
        'customer_orgnumber', 'customer_company', 'customer_phone',
        'customer_cellphone', 'customer_email', 'accept_date', 'productname',
        'nummer som skal benytte', 'orderproductsquantity', 'productquantity',
        'year', 'month', 'weeknum', 'day', 'hour', 'hr', 'org_check',
    ],
].reset_index(drop=True)

In [None]:
# Extracting keys from column and flattening list of list to obtain unique set of keys.
# temp1 holds one list of keys per row of 'custom_dimensions' (each cell is parsed with json.loads);
# flat is the concatenation of those lists; the cell displays the distinct keys.
temp1 = [list(json.loads(payload).keys()) for payload in df1['custom_dimensions']]
flat = [key for row_keys in temp1 for key in row_keys]
set(flat)

## Exploring Data

In [None]:
# Exploring list of categorical columns with value counts
@interact(Column_name=custom_list_1, Percentage=[True, False])
def explore_value_counts(Column_name, Percentage):
  """Tabulate how often each value of ``Column_name`` occurs in ``df1``.

  With ``Percentage`` switched on, the result carries a 'percent' column of
  text values ending in '%'. Otherwise it carries an 'events' column of raw
  counts. Missing values are counted too (``dropna=False``).
  """
  value_label = 'percent' if Percentage else 'events'
  tally = (df1.value_counts(subset=[Column_name], normalize=Percentage, dropna=False)
              .reset_index(name=value_label))
  if not Percentage:
    return tally
  # Round the share to two decimals, scale to percent, round again to drop
  # floating-point noise, then format as text.
  scaled = tally['percent'].round(2) * 100
  return tally.assign(percent = scaled.round(2).astype(str) + '%')

In [None]:
# Exploring a numeric column aggregated per category
@interact(Categories=custom_list_1, Column_name=custom_list_2, Aggregate=['count','nunique','sum','mean','max','min'])
def explore_numeric_columns(Categories, Column_name, Aggregate):
  """Aggregate ``Column_name`` of ``df1`` per value of ``Categories``.

  The result has one row per category, with the aggregate stored under the
  name of the numeric column and rows sorted from largest to smallest.
  """
  summary = (df1.groupby(by=Categories)
                .agg(value=(Column_name, Aggregate))
                .reset_index()
                .rename(columns={'value': Column_name})
                .sort_values(by=[Column_name], ascending=False)
                .reset_index(drop=True))
  return summary

In [None]:
# Visualizing numeric columns dynamically
@interact(Numeric_column=reversed(custom_list_2))
def visualize_numeric(Numeric_column):
  """Draw a count histogram of ``Numeric_column`` from ``df1`` with bins two units wide."""
  # Alternatives listed for `stat`: 'frequency', 'probability', 'proportion', 'percent', 'density'
  histogram_options = dict(stat='count', color='gold', binwidth=2)
  sns.set(rc={'figure.figsize': (10, 5)})
  sns.histplot(data=df1, x=Numeric_column, **histogram_options)
  plt.title(f'Histogram - {Numeric_column}')
  plt.show()

In [None]:
# Visualizing time columns
@interact(Time_column=['month','week_of_year','day_of_week','hour'])
def visualize_time_dimension(Time_column):
  """Plot the number of rows in ``df1`` per value of a time column.

  Bars follow calendar/clock order for whichever column is selected. A single
  weekday list applied to every column hides all bars for the month, week and
  hour views, and a list without 'Friday' hides that day.

  Args:
    Time_column: One of 'month', 'week_of_year', 'day_of_week' or 'hour'.
  """
  category_orders = {
      'month': ['January', 'February', 'March', 'April', 'May', 'June', 'July',
                'August', 'September', 'October', 'November', 'December'],
      'week_of_year': [f'w{week}' for week in range(1, 54)],
      'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                      'Saturday', 'Sunday'],
      'hour': [f'{hour}h' for hour in range(24)],
  }
  # Keep only categories that occur in the data, so a sparse column (e.g. a few
  # weeks out of 53) does not produce a long run of empty bars.
  observed = set(df1[Time_column].dropna().unique())
  bar_order = [label for label in category_orders[Time_column] if label in observed]
  sns.catplot(x=Time_column, data=df1, kind='count', color='#0080FF', height=5, aspect=2.0,
              order=bar_order)  # other colours tried: '#070707', '#454545'
  plt.title(f'Events by {Time_column}')
  # Optional reference line: plt.axhline(y=43, color='red', label='avg_week') followed by plt.legend(...)
  plt.show()

In [None]:
# Interacting with list of columns
@interact(Nace = df1['market_nace_main_desc'].unique().tolist(), Variable = custom_list_2)
def visualize_nace(Nace, Variable):
  """Bar chart of the mean of ``Variable`` per county for one NACE description.

  Rows of ``df1`` are filtered to the chosen ``Nace`` value, averaged per
  county, and drawn with one coloured bar per county.
  """
  county_means = (df1[df1['market_nace_main_desc'] == Nace]
                  .groupby(by=['market_nace_main_desc', 'county'])
                  .agg(avg=(Variable, 'mean'))
                  .reset_index())
  _, chart_ax = plt.subplots(1, 1, figsize=(7.5, 5.5))
  chart_ax.set_title(f'{Nace}: {Variable}')
  sns.barplot(data=county_means, x='market_nace_main_desc', y='avg', hue='county',
              palette='tab20', ax=chart_ax)
  # Legend sits outside the axes, anchored to the upper-left corner of the free space on the right.
  plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
  plt.show()

## Aggregating Data

In [None]:
# Selecting columns for aggregation
# The four empty strings are placeholders: replace them with real column labels
# before running the aggregation cell that groups by this list.
# NOTE(review): this rebinds custom_list_2, which the interactive cells earlier in
# the notebook use as their list of numeric columns — consider a separate name.
custom_list_2 = ['', '', '', ''] # => Add column names for slicing here.

In [None]:
# Main data aggregation: count of non-null customer ids per combination of the selected columns
# Every column label is lower-cased right after loading, so the id column is
# addressed as 'customer_id'; an upper-case 'CUSTOMER_ID' label cannot exist in df1.
# NOTE(review): confirm the dataset actually carries a customer id column under this name.
# dropna=False keeps groups whose keys contain NA values.
df1_a = df1.groupby(by=custom_list_2, dropna=False).agg(CHURN_COUNT = ('customer_id', 'count')).reset_index()

In [None]:
# Creating list of aggregated fields: one value-count table per categorical column
# The counts column is explicitly named after its source column. pandas 2.x
# names every value_counts() result 'count', which would give all tables the
# same first column label — and the Excel export uses that label as the sheet
# name, so the sheets would collide.
df1_list = [df1[i].value_counts(dropna=False).rename(i).to_frame() for i in custom_list_1]

## Exporting results to local drive

In [None]:
# Exporting main excel file
# Writes df2 (without its index) to sheet 'Sheet1' of a new workbook; the
# workbook is saved when the with-block exits and then passed to files.download.
with pd.ExcelWriter('script_20230417.xlsx', engine='openpyxl') as writer:
  df2.to_excel(writer, sheet_name='Sheet1', index=False)
files.download('script_20230417.xlsx')

In [None]:
# Exporting list of excel sheets: one sheet per table in df1_list
with pd.ExcelWriter('Script_202208221130.xlsx', engine='openpyxl') as writer:
  # Iterate over the tables themselves so that every one is written;
  # range(len(df1_list)-1) stopped one short and dropped the last table.
  for table in df1_list:
    # Each sheet is named after the table's first column label.
    table.to_excel(writer, sheet_name=f'{table.columns[0]}', index=True)
files.download('Script_202208221130.xlsx')