In [178]:
import emoji
import sqlite3
import pandas as pd
import numpy as np    
# Load the raw export and normalise its columns in one chain:
# 'app_bought' is renamed to 'apps_bought' and 'product_name' is dropped.
# NOTE(review): escapechar='\n' presumably works around line breaks inside
# fields of this particular file -- confirm against the raw CSV.
df = (
    pd.read_csv('./reddit_exercise_data.csv', sep=',', escapechar='\n')
    .rename(columns={'app_bought': 'apps_bought'})
    .drop(columns=['product_name'])
)

In [179]:
def parse_dates(df):
    '''Parse the mixed-format ``date`` column into a datetime column, in place.

    The length of each string selects how it is read:

    * shorter than 11 characters -- parsed with ``dayfirst=True``;
    * longer than 11 characters  -- parsed with ``yearfirst=True``;
    * exactly 11 characters, or missing -- parsed with pandas' defaults.

    Values are parsed one at a time because the formats are mixed. The parsed
    result is assigned as a single new column rather than patching Timestamps
    into the string column cell by cell, which relied on the column accepting
    mixed value types. The deprecated ``infer_datetime_format`` flag is no
    longer passed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``date`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``date`` converted to datetimes.
    '''
    raw = df['date']
    lengths = raw.str.len()

    def _parse_one(value, length):
        # Comparisons against NaN are False, so missing values fall through
        # to the default parser and end up as NaT in the final column.
        if length < 11:
            return pd.to_datetime(value, dayfirst=True)
        if length > 11:
            return pd.to_datetime(value, yearfirst=True)
        return pd.to_datetime(value)

    parsed = [_parse_one(value, length) for value, length in zip(raw, lengths)]
    # Final conversion guarantees a datetime dtype even for an empty frame.
    df['date'] = pd.to_datetime(pd.Series(parsed, index=df.index))
    return df

In [180]:
def make_buck(df):
    '''Add fixed-width bucket columns for ``apps_bought`` and ``money_spent``.

    The bucket edges are explicit (0-100 in steps of 20 for ``apps_bought``,
    0-500 in steps of 100 for ``money_spent``) so that every label describes
    the interval it is attached to. Passing ``bins=5`` instead would split the
    observed min-max range of the data, and the labels would only be right if
    the data happened to span exactly 0-100 / 0-500.

    Intervals are closed on the right (a value of 20 lands in '0-20') and the
    lowest edge is included, so 0 lands in the first bucket. Values outside
    the edge range get a missing bucket. ``pd.qcut`` is the alternative when
    equally populated buckets are wanted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``apps_bought`` and ``money_spent`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``apps_bought_bucket`` and
        ``money_spent_bucket`` categorical columns added.
    '''

    def _labels(edges):
        # Derive 'lo-hi' labels from the edges so the two cannot drift apart.
        return ['{}-{}'.format(lo, hi) for lo, hi in zip(edges, edges[1:])]

    app_edges = [0, 20, 40, 60, 80, 100]
    money_edges = [0, 100, 200, 300, 400, 500]

    df['apps_bought_bucket'] = pd.cut(
        df['apps_bought'], bins=app_edges, labels=_labels(app_edges),
        include_lowest=True)
    df['money_spent_bucket'] = pd.cut(
        df['money_spent'], bins=money_edges, labels=_labels(money_edges),
        include_lowest=True)
    return df

In [181]:
def preprocessing(df):
    '''Return a cleaned copy of ``df`` with parsed dates, buckets and tidy text.

    Steps, in order:

    1. Work on an independent copy so the caller's frame is never modified
       and the cell can be re-run safely (``df[:]`` is not guaranteed to be
       independent of ``df``, and the helpers below write in place).
    2. ``parse_dates`` converts the ``date`` column to datetimes.
    3. ``make_buck`` adds the two bucket columns.
    4. The first three columns that are neither numeric nor datetime are
       cleaned: carriage returns and the literal text 'r/' are removed and
       the text is lower-cased.
    5. ``iso`` is stored as a categorical column to save memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain the columns used by ``parse_dates`` and
        ``make_buck`` plus an ``iso`` column.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    '''
    cp = df.copy()
    parse_dates(cp)
    make_buck(cp)

    # Everything that is not numeric and not a datetime counts as categorical
    # or text. The dtype helpers cover every integer/float width and datetime
    # resolution, not only int64/float64/datetime64[ns].
    cat_feat = [
        col for col in cp.columns
        if not (pd.api.types.is_numeric_dtype(cp[col])
                or pd.api.types.is_datetime64_any_dtype(cp[col]))
    ]

    # Only the first three such columns are cleaned; the bucket columns added
    # by make_buck come later in column order.
    for col in cat_feat[:3]:
        cp[col] = (
            cp[col]
            # A real carriage-return character with regex=False behaves the
            # same on every pandas version; the raw pattern r'\r' only matched
            # a carriage return when regex mode was on.
            .str.replace('\r', '', regex=False)
            .str.replace('r/', '', regex=False)
            .str.lower()
        )

    # 'category' needs less memory than plain strings for this column.
    cp['iso'] = cp['iso'].astype('category')
    return cp

In [182]:
# Build the cleaned frame from the raw load; `x` is what gets written to the
# database further down.
x = preprocessing(df)
# Keep a CSV snapshot of the cleaned data (to_csv also writes the index as
# the first, unnamed column by default).
x.to_csv('DL_preprocessed.csv')

In [186]:
def pop_reviews(df, db_file):
    '''Replace the contents of the ``reviews`` table with the rows of ``df``.

    Existing rows are deleted and the frame is appended, so a table that was
    created beforehand keeps its schema. If the table does not exist yet the
    delete is skipped and ``to_sql`` creates it. The connection is always
    closed, including when the delete or the insert fails; closing without a
    commit discards the pending delete, so a failed load leaves the old rows
    in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to store; the index is not written.
    db_file : str
        Path of the SQLite database file.

    Returns
    -------
    None
    '''
    db = sqlite3.connect(db_file)
    try:
        table_exists = db.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'reviews'"
        ).fetchone() is not None
        if table_exists:
            db.execute('DELETE FROM reviews')
        df.to_sql('reviews', db, if_exists='append', index=False)
        db.commit()
    finally:
        db.close()
    return None

In [187]:
# Load the cleaned frame into the reviews table of the exercise database.
pop_reviews(x, './exercise_database.db')

In [202]:
def avg_score_iso(db_file, csv_path='./avg_by_iso.csv'):
    '''Average review score per ``iso`` value, lowest average first.

    Queries the ``reviews`` table, writes the complete result to ``csv_path``
    and returns only the three lowest-scoring rows for display.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database file.
    csv_path : str, optional
        Where the full result is written. Defaults to './avg_by_iso.csv'.

    Returns
    -------
    pandas.DataFrame
        The first three rows of the result (columns ``iso``, ``avg_by_iso``).
    '''
    query = ('SELECT iso, AVG(score) as avg_by_iso FROM reviews '
             'GROUP BY iso ORDER BY avg_by_iso ASC')
    db = sqlite3.connect(db_file)
    try:
        f_1 = pd.read_sql(query, db)
    finally:
        # Release the connection even when the query fails.
        db.close()
    f_1.to_csv(csv_path)
    return f_1[:3]

In [203]:
# Write the per-ISO averages to CSV and display the three lowest-scoring ISO codes.
avg_score_iso('./exercise_database.db')

Unnamed: 0,iso,avg_by_iso
0,ec,3.666667
1,bh,4.0
2,hn,4.0


In [206]:
def Maximum_score_by_apps_bought_bucket(db_file, csv_path='./Max_score_by_app_bucket.csv'):
    '''Maximum review score per ``apps_bought_bucket``, lowest maximum first.

    Queries the ``reviews`` table, writes the result to ``csv_path`` and
    returns it in full.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database file.
    csv_path : str, optional
        Where the result is written. Defaults to
        './Max_score_by_app_bucket.csv'.

    Returns
    -------
    pandas.DataFrame
        One row per bucket (columns ``apps_bought_bucket``, ``Max_score``).
    '''
    query = ('SELECT apps_bought_bucket, MAX(score) as Max_score FROM reviews '
             'GROUP BY apps_bought_bucket ORDER BY Max_score ASC')
    db = sqlite3.connect(db_file)
    try:
        f_2 = pd.read_sql(query, db)
    finally:
        # Release the connection even when the query fails.
        db.close()
    f_2.to_csv(csv_path)
    return f_2

In [207]:
# Write the per-bucket maximum scores to CSV and display the full table.
Maximum_score_by_apps_bought_bucket('./exercise_database.db')

Unnamed: 0,apps_bought_bucket,Max_score
0,0-20,5
1,20-40,5
2,40-60,5
3,60-80,5
4,80-100,5


In [208]:
def avg_score_day(db_file, csv_path='./avg_day.csv'):
    '''Average review score per stored ``date`` value, lowest average first.

    Queries the ``reviews`` table, writes the complete result to ``csv_path``
    and returns only the three lowest-scoring rows for display.

    NOTE(review): rows are grouped on the ``date`` text exactly as stored. If
    stored values carry differing time-of-day parts they will not be merged
    into one calendar day -- confirm against the data.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database file.
    csv_path : str, optional
        Where the full result is written. Defaults to './avg_day.csv'.

    Returns
    -------
    pandas.DataFrame
        The first three rows of the result (columns ``date``, ``avg_day``).
    '''
    query = ('SELECT date, AVG(score) as avg_day FROM reviews '
             'GROUP BY date ORDER BY avg_day ASC')
    db = sqlite3.connect(db_file)
    try:
        f_3 = pd.read_sql(query, db)
    finally:
        # Release the connection even when the query fails.
        db.close()
    f_3.to_csv(csv_path)
    return f_3[:3]

In [209]:
# Write the per-date averages to CSV and display the three lowest-scoring dates.
avg_score_day('./exercise_database.db')

Unnamed: 0,date,avg_day
0,2017-06-21 00:00:00,2.333333
1,2017-06-22 00:00:00,2.8
2,2017-06-25 00:00:00,2.8


In [218]:
def avg_score_day_week(db_file, csv_path='./avg_day_week.csv'):
    '''Average review score per day of the week, lowest average first.

    The day comes from SQLite's ``strftime('%w', date)``, which yields the
    text '0' for Sunday through '6' for Saturday. The format is written as a
    single-quoted SQL string literal: a double-quoted token is an identifier
    in SQL and is only accepted as a string by SQLite as a legacy fallback.

    Queries the ``reviews`` table, writes the result to ``csv_path`` and
    returns it in full.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database file.
    csv_path : str, optional
        Where the result is written. Defaults to './avg_day_week.csv'.

    Returns
    -------
    pandas.DataFrame
        One row per weekday (columns ``day``, ``avg_by_day_week``).
    '''
    query = ("SELECT strftime('%w', date) as day, AVG(score) as avg_by_day_week "
             "FROM reviews GROUP BY day ORDER BY avg_by_day_week ASC")
    db = sqlite3.connect(db_file)
    try:
        f_4 = pd.read_sql(query, db)
    finally:
        # Release the connection even when the query fails.
        db.close()
    f_4.to_csv(csv_path)
    return f_4

In [219]:
# Write the per-weekday averages to CSV and display the full table (day 0 is Sunday).
avg_score_day_week('./exercise_database.db')

Unnamed: 0,day,avg_by_day_week
0,3,4.498655
1,4,4.498947
2,5,4.53527
3,0,4.566288
4,1,4.594907
5,2,4.609959
6,6,4.654064


In [220]:
%%sql sqlite:///exercise_database.db


Done.


cid,name,type,notnull,dflt_value,pk
0,review,TEXT,0,,0
1,title,TEXT,0,,0
2,iso,TEXT,0,,0
3,score,INTEGER,0,,0
4,date,TEXT,0,,0
5,apps_bought,INTEGER,0,,0
6,money_spent,NUMERIC,0,,0
7,apps_bought_bucket,TEXT,0,,0
8,money_spent_bucket,TEXT,0,,0
