In [42]:
import pandas as pd
import numpy as np
import glob
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from string import ascii_lowercase

pd.set_option('display.max_columns', 500)

In [43]:
excel_files = glob.glob('data/*.xlsx')
excel_dfs = []

for file in excel_files:
    df = pd.read_excel(file, index_col=0)
    df['filename'] = file
    excel_dfs.append(df)

excel_df = pd.concat(excel_dfs, ignore_index=True)

excel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1566 entries, 0 to 1565
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Publication Name  1566 non-null   object        
 1   Accepted At Date  1566 non-null   datetime64[ns]
 2   Post Title        1565 non-null   object        
 3   Author            1566 non-null   object        
 4   Post URL          1566 non-null   object        
 5   Tier              1566 non-null   object        
 6   Name              370 non-null    object        
 7   Added Date        1533 non-null   datetime64[ns]
 8   Reads             1566 non-null   int64         
 9   Total Time Read   1566 non-null   float64       
 10  Views             1566 non-null   int64         
 11  Moc Views         1566 non-null   int64         
 12  filename          1566 non-null   object        
dtypes: datetime64[ns](2), float64(1), int64(3), object(7)
memory usage: 159.2+ KB


In [44]:
df = excel_df
df.head()

Unnamed: 0,Publication Name,Accepted At Date,Post Title,Author,Post URL,Tier,Name,Added Date,Reads,Total Time Read,Views,Moc Views,filename
0,Towards Data Science,2020-03-02,CatBoost,Manu Joseph,https://www.medium.com/p/d1f1366aca34,Tier 3,,2020-03-02,0,0.0,0,0,data/Feb_2020.xlsx
1,Towards Data Science,2020-03-02,Exploring SimCLR: A Simple Framework for Contr...,Thalles Silva,https://www.medium.com/p/158c30601e7e,Tier 3,,2020-03-02,0,0.0,0,0,data/Feb_2020.xlsx
2,Towards Data Science,2020-03-02,Monte Carlo Methods,Reuben Kavalov,https://www.medium.com/p/2b14657b7032,Tier 3,,2020-03-02,6,31.546717,15,15,data/Feb_2020.xlsx
3,Towards Data Science,2020-03-01,Selecting a few decision trees to represent a ...,Ahsan Saeed,https://www.medium.com/p/11283e433dbb,Tier 3,,2020-03-01,66,153.033033,178,163,data/Feb_2020.xlsx
4,Towards Data Science,2020-03-01,Self Supervised Depth Estimation: Breaking Dow...,Daryl Tan,https://www.medium.com/p/f212e4f05ffa,Tier 3,,2020-03-01,73,197.612284,157,157,data/Feb_2020.xlsx


In [45]:
# cleaning functions

def clean_col_names(df):
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' ','_')
    return df

def drop_cols(df):
    # drop some columns!
    df = df.drop(['publication_name', 
                  'accepted_at_date',
                  'post_url', 
                  'tier',
                  'name',
                  'added_date',
                  'reads',
                  'total_time_read',
                  'moc_views',
                  'filename'], axis=1)
    return df

def drop_dups(df):
    # drop duplicate rows based on post_title
    df = df.drop_duplicates('post_title', keep='first')
    return df

def rename_cols(df):
    # rename columns
    columns = ['product', 'customer_id', 'amount']
    df.columns = columns
    return df


In [46]:
def clean(df):
    
    df = (df.pipe(clean_col_names)
            .pipe(drop_cols)
            .pipe(drop_dups)
            .pipe(rename_cols)
         )
    
    return df
    

In [47]:
df = clean(df)

In [54]:
df.head(10)

Unnamed: 0,product,customer_id,amount
0,outlxwfhyu,customer1,0
1,ukjakcfikb,customer2,0
2,fstxfaulvj,customer3,15
3,wcpurpgyce,customer4,178
4,oyhjphvfbb,customer5,157
5,qlmsbhrwgj,customer6,599
6,glkvilakda,customer7,374
7,fcsdkcwjqc,customer8,68
9,chssubncks,customer10,95
11,eiqzsayimw,customer12,130


In [55]:
df.amount.groupby(df['customer_id']).sum()

customer_id
customer1      1396
customer10       95
customer100     110
customer103    8111
customer104    8803
               ... 
customer95      140
customer96     7345
customer97      408
customer98      325
customer99     1814
Name: amount, Length: 562, dtype: int64

In [51]:
df['product'].nunique(), df['customer_id'].nunique()

(1099, 701)

In [52]:
# convert publication_name and author to random strings
df['product'] = df['product'].apply(lambda x: ''.join(np.random.choice(list(ascii_lowercase), 10)))

In [53]:
# this loses 701 to 562 authors but okay
df['customer_id'] = 'customer' + pd.Series(pd.factorize(df['customer_id'])[0] + 1).astype(str)

In [57]:
df.to_csv('data/sample.csv', index=True)