In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
start_set = {'category': [],
             'month' : [],
             'year' : [],
             'clicks' : []
           }

full_set = pd.DataFrame(start_set)

In [3]:
num = 1

def parse_dataset(num):
    file_path = f'../data/pis_{num}.csv'
    df = pd.read_csv(file_path)#, names = ['category', 'month', 'year', 'clicks'])
    return df

In [4]:
for num in range(1, 101):
    temp_df = parse_dataset(num)
    #df['year'] = year
    full_set = pd.concat([temp_df, full_set])

In [5]:
df = full_set.reset_index(drop=True)

# Tasks
### Create an analysis to help the content team and answer following questions:

1. Is there a seasonal pattern for all categories?
2. What are common patterns that we observe within the categories?
3. Are there categories with a similar fingerprint?
4. Now that you know the seasonal pattern, what next actions would you recommend for the company?
5. Create a presentation to show what you did. Make your results accessible for a broad audience so that they can use it.

In [6]:
df['category'].nunique()

100

In [7]:
## CHECK FOR NA's

df.isnull().sum()

category     0
month        0
year         0
clicks      30
dtype: int64

In [8]:
null_index = df[df['clicks'].isnull()].index.tolist()

In [9]:
df = df.sort_values(by=['category','month']).reset_index(drop=True)

In [10]:
## FILL NA VALUES WITH AN AVG TAKEN FROM VALUE ABOVE & BELOW

df['clicks'] = df['clicks'].fillna((df['clicks'].shift() + df['clicks'].shift(-1)) / 2)

In [36]:
imputed_vals = df.loc[null_index].reset_index(drop=True)
imputed_vals = imputed_vals[['category','month','clicks','avg_cat_clicks']].sort_values(by=['category','month'])
imputed_vals['imputed_clicks'] = imputed_vals['clicks']
imputed_vals['imputed_diff_avg'] = (imputed_vals['imputed_clicks'] - imputed_vals['avg_cat_clicks'])
imputed_vals = imputed_vals[['category','month','avg_cat_clicks','imputed_clicks','imputed_diff_avg']]
imputed_vals

Unnamed: 0,category,month,avg_cat_clicks,imputed_clicks,imputed_diff_avg
0,All-in-One PCs,6,76766.416667,52846,-23920.416667
1,Drills & Bits,5,68036.666667,67636,-400.666667
2,Epilators & Ladyshavers,10,61210.0,41129,-20081.0
3,"Eye, Ear & Nose Medicines",4,56615.916667,61113,4497.083333
4,Joint & muscle preparations,3,84028.166667,95780,11751.833333
5,Nursing Gift Sets,7,52633.083333,39030,-13603.083333
6,PowerLine,7,69059.166667,63422,-5637.166667
7,Razor Blades & Shaving Heads,11,76386.333333,97742,21355.666667
8,Razor Blades & Shaving Heads,12,76386.333333,75069,-1317.333333
9,Ski helmets & snowboard helmets,2,76446.666667,141912,65465.333333


In [37]:
imputed_vals.to_csv('../data/imputed_vals.csv', index=False)

In [12]:
## CONVERT FLOATS TO INTEGERS

df['year'] = df['year'].astype(int)
df['month'] = df['month'].astype(int)
df['clicks'] = df['clicks'].astype(int)

In [13]:
df.shape

(1200, 4)

In [None]:
## BIN MONTHS INTO SEASONS

In [14]:
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Autumn'
    else:
        return 'Unknown'

# Apply the function to create a new 'Season' column
df['season'] = df['month'].apply(get_season)
df=df[['category','year','season','month','clicks']]
df

Unnamed: 0,category,year,season,month,clicks
0,Accessories for kitchen appliances,2021,Winter,1,81676
1,Accessories for kitchen appliances,2021,Winter,2,59529
2,Accessories for kitchen appliances,2021,Spring,3,64537
3,Accessories for kitchen appliances,2021,Spring,4,55455
4,Accessories for kitchen appliances,2021,Spring,5,52882
...,...,...,...,...,...
1195,washer dryer,2021,Summer,8,54857
1196,washer dryer,2021,Autumn,9,63418
1197,washer dryer,2021,Autumn,10,72819
1198,washer dryer,2021,Autumn,11,101180


In [None]:
## ADD COLUMNS FOR AVG etc

In [17]:
df['ttl_cat_clicks'] = df.groupby('category')['clicks'].transform('sum')

df['min_cat_clicks'] = df.groupby('category')['clicks'].transform('min')
df['avg_cat_clicks'] = df.groupby('category')['clicks'].transform('mean')
df['max_cat_clicks'] = df.groupby('category')['clicks'].transform('max')

df['std_cat_clicks'] = df.groupby('category')['clicks'].transform('std')

df['co_var_cat']=(df['std_cat_clicks'] / df['avg_cat_clicks']) * 100

    ## The idxmin function is used to find the index (which corresponds to the month) 
    ## where the minimum value occurs within each group.

min_month_per_category = df.loc[df.groupby('category')['clicks'].idxmin()][['category', 'month']]
max_month_per_category = df.loc[df.groupby('category')['clicks'].idxmax()][['category', 'month']]


df_merged = pd.merge(df, min_month_per_category, on='category', how='left', suffixes=('', '_min_clicks'))
df_merged2 = pd.merge(df_merged, max_month_per_category, on='category', how='left', suffixes=('', '_max_clicks'))
df_merged2


df = df_merged2[['category','year','season','month','clicks',
                 'ttl_cat_clicks','month_min_clicks','min_cat_clicks',
                 'max_cat_clicks','month_max_clicks',
                 'avg_cat_clicks','std_cat_clicks','co_var_cat']]

df['mnth_ranking'] = df.groupby('category')['clicks'].rank(ascending=False)
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mnth_ranking'] = df.groupby('category')['clicks'].rank(ascending=False)


Unnamed: 0,category,year,season,month,clicks,ttl_cat_clicks,month_min_clicks,min_cat_clicks,max_cat_clicks,month_max_clicks,avg_cat_clicks,std_cat_clicks,co_var_cat,mnth_ranking
0,Accessories for kitchen appliances,2021,Winter,1,81676,774200,6,41856,118898,12,64516.666667,24704.093137,38.291025,3.0
1,Accessories for kitchen appliances,2021,Winter,2,59529,774200,6,41856,118898,12,64516.666667,24704.093137,38.291025,5.0
2,Accessories for kitchen appliances,2021,Spring,3,64537,774200,6,41856,118898,12,64516.666667,24704.093137,38.291025,4.0
3,Accessories for kitchen appliances,2021,Spring,4,55455,774200,6,41856,118898,12,64516.666667,24704.093137,38.291025,7.0
4,Accessories for kitchen appliances,2021,Spring,5,52882,774200,6,41856,118898,12,64516.666667,24704.093137,38.291025,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,washer dryer,2021,Summer,8,54857,761006,6,46658,101180,11,63417.166667,15479.855689,24.409567,7.0
1196,washer dryer,2021,Autumn,9,63418,761006,6,46658,101180,11,63417.166667,15479.855689,24.409567,6.0
1197,washer dryer,2021,Autumn,10,72819,761006,6,46658,101180,11,63417.166667,15479.855689,24.409567,3.0
1198,washer dryer,2021,Autumn,11,101180,761006,6,46658,101180,11,63417.166667,15479.855689,24.409567,1.0


In [None]:
df.to_csv('../data/combined_smartbuy.csv', index=False)

In [19]:
agg_df = df[['category','ttl_cat_clicks','month_min_clicks','min_cat_clicks',
             'max_cat_clicks','month_max_clicks',
             'avg_cat_clicks','std_cat_clicks','co_var_cat',]]

agg_df.drop_duplicates(subset='category', keep='first', inplace=True)
agg_df.reset_index(drop=True, inplace=True)

agg_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agg_df.drop_duplicates(subset='category', keep='first', inplace=True)


Unnamed: 0,category,ttl_cat_clicks,month_min_clicks,min_cat_clicks,max_cat_clicks,month_max_clicks,avg_cat_clicks,std_cat_clicks,co_var_cat
0,Accessories for kitchen appliances,774200,6,41856,118898,12,64516.666667,24704.093137,38.291025
1,All-in-One PCs,921197,6,52846,134469,11,76766.416667,23784.127352,30.982464
2,"Champagne, Sparkling Wine & Prosecco",656334,6,43985,81142,11,54694.500000,11131.379858,20.351918
3,Coffee & espresso,621284,6,36226,61849,3,51773.666667,6848.562149,13.227887
4,Coffee machine cleaning,700288,6,44302,81720,1,58357.333333,9751.063796,16.709235
...,...,...,...,...,...,...,...,...,...
95,tracksuits,908321,6,53258,97270,11,75693.416667,13233.028695,17.482404
96,trekking sandals,694350,12,14641,143488,6,57862.500000,45773.249408,79.106934
97,turntable,643784,8,35788,86713,12,53648.666667,18823.303702,35.086247
98,umbrellas,737030,12,8408,160126,6,61419.166667,53278.747391,86.746125


In [None]:
agg_df.sort_values(by='co_var_cat').head(20)

In [None]:
agg_df.to_csv('../data/agg_smartbuy.csv', index=False)

In [26]:
df_num = agg_df.set_index('category').transpose()
df_num

category,Accessories for kitchen appliances,All-in-One PCs,"Champagne, Sparkling Wine & Prosecco",Coffee & espresso,Coffee machine cleaning,Drills & Bits,Epilators & Ladyshavers,"Eye, Ear & Nose Medicines",Fireplaces & Stoves,Garbage can,...,stylus,sun care,thermos flasks,thermostats,toy vehicles,tracksuits,trekking sandals,turntable,umbrellas,washer dryer
ttl_cat_clicks,774200.0,921197.0,656334.0,621284.0,700288.0,816440.0,734520.0,679391.0,859562.0,760390.0,...,787384.0,822477.0,733768.0,1049359.0,823536.0,908321.0,694350.0,643784.0,737030.0,761006.0
month_min_clicks,6.0,6.0,6.0,6.0,6.0,2.0,2.0,8.0,6.0,6.0,...,6.0,12.0,9.0,6.0,6.0,6.0,12.0,8.0,12.0,6.0
min_cat_clicks,41856.0,52846.0,43985.0,36226.0,44302.0,52772.0,35819.0,48186.0,25040.0,47930.0,...,35969.0,26556.0,34906.0,28530.0,25911.0,53258.0,14641.0,35788.0,8408.0,46658.0
max_cat_clicks,118898.0,134469.0,81142.0,61849.0,81720.0,87129.0,93155.0,66167.0,132857.0,76567.0,...,126711.0,180883.0,97556.0,198969.0,200438.0,97270.0,143488.0,86713.0,160126.0,101180.0
month_max_clicks,12.0,11.0,11.0,3.0,1.0,10.0,11.0,1.0,11.0,1.0,...,11.0,6.0,1.0,11.0,12.0,11.0,6.0,12.0,6.0,11.0
avg_cat_clicks,64516.666667,76766.416667,54694.5,51773.666667,58357.333333,68036.666667,61210.0,56615.916667,71630.166667,63365.833333,...,65615.333333,68539.75,61147.333333,87446.58,68628.0,75693.416667,57862.5,53648.666667,61419.166667,63417.166667
std_cat_clicks,24704.093137,23784.127352,11131.379858,6848.562149,9751.063796,10828.154576,17145.890236,6262.510598,36987.834486,8261.293701,...,32118.907853,48369.255935,19172.446903,54278.33,59480.538198,13233.028695,45773.249408,18823.303702,53278.747391,15479.855689
co_var_cat,38.291025,30.982464,20.351918,13.227887,16.709235,15.915175,28.011583,11.061396,51.637231,13.037458,...,48.950308,70.571101,31.35451,62.07027,86.670948,17.482404,79.106934,35.086247,86.746125,24.409567


In [27]:
df_corr = df_num.corr()
df_corr

category,Accessories for kitchen appliances,All-in-One PCs,"Champagne, Sparkling Wine & Prosecco",Coffee & espresso,Coffee machine cleaning,Drills & Bits,Epilators & Ladyshavers,"Eye, Ear & Nose Medicines",Fireplaces & Stoves,Garbage can,...,stylus,sun care,thermos flasks,thermostats,toy vehicles,tracksuits,trekking sandals,turntable,umbrellas,washer dryer
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accessories for kitchen appliances,1.000000,0.999944,0.999292,0.998338,0.999084,0.998616,0.999631,0.997889,0.999558,0.998308,...,0.999880,0.996961,0.999779,0.998588,0.994636,0.998761,0.997126,0.999807,0.995703,0.999675
All-in-One PCs,0.999944,1.000000,0.999626,0.998840,0.999467,0.999087,0.999799,0.998490,0.999288,0.998829,...,0.999668,0.996114,0.999878,0.997986,0.993527,0.999188,0.996277,0.999920,0.994682,0.999885
"Champagne, Sparkling Wine & Prosecco",0.999292,0.999626,1.000000,0.999700,0.999972,0.999856,0.999772,0.999607,0.998152,0.999735,...,0.998646,0.993341,0.999665,0.995916,0.990078,0.999847,0.993599,0.999765,0.991561,0.999924
Coffee & espresso,0.998338,0.998840,0.999700,1.000000,0.999851,0.999962,0.999472,0.999905,0.997277,0.999988,...,0.997551,0.990918,0.999225,0.994227,0.987103,0.999968,0.991530,0.999247,0.989285,0.999429
Coffee machine cleaning,0.999084,0.999467,0.999972,0.999851,1.000000,0.999945,0.999757,0.999745,0.997982,0.999868,...,0.998399,0.992728,0.999611,0.995517,0.989301,0.999945,0.993088,0.999676,0.990997,0.999845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tracksuits,0.998761,0.999188,0.999847,0.999968,0.999945,0.999978,0.999684,0.999834,0.997770,0.999958,...,0.998062,0.991926,0.999491,0.995002,0.988317,1.000000,0.992473,0.999522,0.990335,0.999661
trekking sandals,0.997126,0.996277,0.993599,0.991530,0.993088,0.991970,0.995222,0.990282,0.998326,0.991326,...,0.998127,0.999807,0.995866,0.999725,0.999302,0.992473,1.000000,0.995701,0.999855,0.994899
turntable,0.999807,0.999920,0.999765,0.999247,0.999676,0.999414,0.999947,0.998905,0.999219,0.999218,...,0.999496,0.995326,0.999967,0.997536,0.992525,0.999522,0.995701,1.000000,0.994019,0.999932
umbrellas,0.995703,0.994682,0.991561,0.989285,0.990997,0.989742,0.993498,0.987838,0.997309,0.989033,...,0.996958,0.999628,0.994247,0.999204,0.999502,0.990335,0.999855,0.994019,1.000000,0.993065
