# Data Mining: X Days

In [None]:
import pandas as pd
import numpy as np
from datetime import date

from scipy import stats
# Turn on IPython's greedy tab completion (IPCompleter.greedy) for this session.
# get_ipython() is only available when running inside IPython/Jupyter.
get_ipython().magic(u'config IPCompleter.greedy=True')


## Connect with the Database

In [None]:
from contextlib import closing


import psycopg2
import simplejson
import sys
# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# `reload` is not a builtin on Python 3 and `sys.setdefaultencoding` does not
# exist there (the default is already UTF-8), so the hack is guarded by version.
if sys.version_info[0] < 3:
    reload(sys)  # brings back setdefaultencoding, which is removed at startup
    sys.setdefaultencoding('utf8')

DEFAULT_DB = 'data_depot'
DEFAULT_HOST = '-data.c8exzn6geij3.us-east-1.redshift.amazonaws.com'
DEFAULT_PORT = 5439


class PsycopgConnector:
    '''
    A database connector that uses Psycopg to connect to Redshift.

    How to play:

        psy_conn = PsycopgConnector(username, password)
        df = psy_conn.run_query(sql=sql, return_data=True)

    A non-default database, host or port can be selected with the `db`,
    `host` and `port` keyword arguments.

    NOTE: This class commits queries to redshift if return_data=False.
    This means INSERT, DROP, TRUNCATE, etc. all work against the DB.
    '''

    def __init__(
        self,
        username=None,
        password=None,
        db=DEFAULT_DB,
        host=DEFAULT_HOST,
        port=DEFAULT_PORT,
    ):
        '''
        Store the connection settings; no connection is opened here.

        username, password -- credentials passed to psycopg2.connect
        db, host, port     -- target database; default to the module-level
                              DEFAULT_DB / DEFAULT_HOST / DEFAULT_PORT
        '''
        # Use the values the caller passed in. Previously the module defaults
        # were stored unconditionally, so these arguments were silently ignored.
        self.db = db
        self.host = host
        self.port = port

        self.username = username
        self.password = password

    def _get_connection(self):
        '''Open a new psycopg2 connection, keep it on self.conn and return it.'''
        self.conn = psycopg2.connect(
            dbname=self.db,
            user=self.username,
            password=self.password,
            host=self.host,
            port=self.port
        )

        return self.conn

    def run_query(self, sql, return_data=False):
        '''
        Run `sql` on a fresh connection that is closed afterwards.

        sql         -- the SQL text to execute
        return_data -- when True, return the result of pd.read_sql(sql, conn);
                       when False, execute the statement and return None
        '''
        # closing() guarantees conn.close(); the inner `with conn` is the
        # psycopg2 transaction scope that ends the transaction on exit.
        with closing(self._get_connection()) as conn:
            with conn:
                if return_data:
                    return pd.read_sql(sql=sql, con=conn)

                # Only the non-returning path needs its own cursor.
                with conn.cursor() as cur:
                    cur.execute(sql)
                    

# Load the database credentials from the local JSON file
with open("creds.json.nogit") as fh:
    creds = simplejson.load(fh)

# Missing keys come back as None (dict.get)
username, password = creds.get("user_name"), creds.get("password")

# Connector instance used by the queries in the rest of the notebook
pig = PsycopgConnector(username, password)

## Functions

In [None]:
# Word count function
import re
def words_count(strg):
    """Return the number of words in `strg`.

    A word is a run of word characters and/or apostrophes (regex ``[\\w']+``).
    An empty string or a null value (as judged by ``pd.isnull``) counts as 0.
    """
    # Guard clause: nothing to count for empty or missing text
    if strg == '' or pd.isnull(strg):
        return 0

    return len(re.findall(r"[\w']+", strg))
    

# 1. Import Invoice Data & Extract Avg Word Counts Features

## 1.01 Invoice within X days

In [None]:
# SQL for importing all invoices created within X days after signup_date
sql_invoices_Xdays_all_accounts = '''invocie created in x days'''

# Import as dataframe from redshift
df_invoices_Xdays_all_accounts = pig.run_query(sql_invoices_Xdays_all_accounts, return_data=True)


# Word counts of each invoice's description, notes, terms and address.
# Each count goes to its own column, named as the final table below expects.
# Previously all four were assigned to one and the same column, so every
# count except the last (address) was overwritten.
wc_column_sources = [
    ('avg_wc_des', 'description'),
    ('avg_wc_nt', 'notes'),
    ('avg_wc_trm', 'terms'),
    ('avg_wc_adr', 'address'),
]
for wc_column, text_column in wc_column_sources:
    # Column-wise apply: one words_count call per cell, no per-row Series
    df_invoices_Xdays_all_accounts[wc_column] = (
        df_invoices_Xdays_all_accounts[text_column].apply(words_count))


# Filters the text columns from the dataframe
# NOTE(review): 'short_text_column_list' looks like a placeholder for the real
# column list - it must contain 'id' (grouped on below) and the avg_wc_*
# columns created above; confirm before running.
df_invoices_Xdays_all_accounts_fil = df_invoices_Xdays_all_accounts.filter(['short_text_column_list'])

# Averaging (grouping) all invoices for an 'id'
df_word_count_Xdays_all_accounts_total = df_invoices_Xdays_all_accounts_fil.groupby('id').mean()

# Final word count table
df_word_count_Xdays_all_accounts_final = df_word_count_Xdays_all_accounts_total.filter([
    'id',
    'signup_date',
    'avg_wc_des',
    'avg_wc_nt',
    'avg_wc_trm',
    'avg_wc_adr',
])

# 2. Features


In [None]:
# ---- Import RSystems, Periodic Invoices & Client Counts Data ----

# Query text
sql_invoices_clients_activities_all_accounts = ''' SQL QUERIES'''

# Run the query and load the result as a dataframe
df_invoices_clients_activities_all_accounts = pig.run_query(
    sql=sql_invoices_clients_activities_all_accounts,
    return_data=True,
)


# 4. Import and Extract Events
## 4.1 Event data collection 

In [None]:
# ---- Event Features Extraction ----

# Query text for the events
sql_events = '''SQL QUERY'''

# Run the query on redshift and load the result as a dataframe
df_events_all_accounts = pig.run_query(
    sql=sql_events,
    return_data=True,
)


## 4.2 Removing whitespace from the event strings

In [None]:
# Drop every row in which any cell is None (it renders as the string 'None').
# `axis` is passed by keyword because newer pandas releases no longer accept
# it positionally. `.copy()` detaches the result from the original frame so
# the edits below do not operate on a slice of it.
has_none_cell = df_events_all_accounts.astype(str).eq('None').any(axis=1)
df_events_all_accounts = df_events_all_accounts[~has_none_cell].copy()

# Replace the remaining NaN cells by zero
df_events_all_accounts = df_events_all_accounts.fillna(0)

# Remove spaces, hyphens and slashes from each event name (column-wise apply,
# one call per cell instead of building a Series for every row)
df_events_all_accounts['event_name'] = df_events_all_accounts['event'].apply(
    lambda name: name.replace(' ', '').replace('-', '').replace('/', ''))

In [None]:
# Keep only the columns needed for day X: account id, event count, event name
df_events_all_accounts_day_X = df_events_all_accounts.loc[
    :, ['id', 'event_count_day_X', 'event_name']]

## 4.3 Pivot the events (each unique event becomes a column)

In [None]:
# One row per 'id', one column per distinct 'event_name'; cells hold the
# summed 'event_count_day_X', with 0 where an id has no such event
df_events_all_accounts_day_X = pd.pivot_table(
    df_events_all_accounts_day_X,
    index='id',
    columns='event_name',
    values='event_count_day_X',
    aggfunc=np.sum,
    fill_value=0,
)

# Clear the name carried over on the column axis
df_events_all_accounts_day_X.columns.name = None

# Turn 'id' back into a regular column and zero out any remaining NaN
df_events_all_accounts_day_X = df_events_all_accounts_day_X.reset_index().fillna(0)

# 5. Merging all data

In [None]:
# Left-join the day X event columns onto the report-system data by 'id'
df_events_day_X = df_invoices_clients_activities_all_accounts.merge(
    df_events_all_accounts_day_X, how='left', on='id')

# Left-join the average word counts onto that result, again by 'id'
df_events_avg_wc_day_X = df_events_day_X.merge(
    df_word_count_Xdays_all_accounts_final, how='left', on='id')

# 6. Filtering out test accounts 

In [None]:
################# Filtering Out Test Accounts #############################################################

# Read the tab-separated file of test-account emails
test_emails = pd.read_csv("path", sep="\t")

# Plain Python list of the values in its 'email' column
test_email_list = test_emails['email'].tolist()

In [None]:
# Function: Filtering  test account by using admin email
from difflib import SequenceMatcher

def email_match(em, email_list):
    """Return the best similarity between `em` and any entry of `email_list`.

    Similarity is ``difflib.SequenceMatcher(None, em, candidate).ratio()``,
    a float between 0 and 1 (1.0 for identical strings).

    em         -- the email to score; a null value (per ``pd.isnull``) scores 0
    email_list -- candidate emails; null entries are skipped

    Returns 0 when `em` is null or when there is no usable candidate.
    """
    # The null check does not depend on the candidate, so do it once up front
    if pd.isnull(em):
        return 0

    match_score = 0
    for candidate in email_list:
        # A NaN/None entry (e.g. an empty CSV cell) cannot be compared
        if pd.isnull(candidate):
            continue
        match_score = max(match_score, SequenceMatcher(None, em, candidate).ratio())

    return match_score
    


In [None]:
# Keep only the rows whose admin email is not a near match (similarity below
# 0.95) of any test email. The score is computed from the 'admin_email' column
# alone; astype(bool) keeps the mask boolean even when the frame is empty.
is_not_test_account = df_events_avg_wc_day_X['admin_email'].apply(
    lambda em: email_match(em, test_email_list) < 0.95).astype(bool)

# .copy() makes the result an independent frame, so adding columns to it later
# does not write into a slice of df_events_avg_wc_day_X
df_events_avg_wc_day_X_notest = df_events_avg_wc_day_X[is_not_test_account].copy()

# 7. Filtering only important features: Day X

In [None]:
# Load the important-features file
# NOTE(review): sep="\n," is an unusual separator - confirm it matches the file layout
important_features = pd.read_csv("path", sep="\n,")

# Plain Python list of the values in its 'important_feature' column
imp_features_list = important_features['important_feature'].tolist()

In [None]:
# Notebook check: display how many important features were loaded
len(imp_features_list)

In [None]:
# Add every important feature that is missing from the frame as an all-zero
# column, reporting each one that had to be added.
# (The membership test previously referenced a misspelled frame name
# containing a stray character, so this cell could not run.)
for feature in imp_features_list:
    if feature not in df_events_avg_wc_day_X_notest.columns:
        print("False: ", feature)
        df_events_avg_wc_day_X_notest[feature] = 0


In [None]:
# Restrict the frame to the important features that are present in it
present_features = df_events_avg_wc_day_X_notest.columns.intersection(imp_features_list)
df_imp_features_new_accounts_day_X = df_events_avg_wc_day_X_notest[present_features]

# Re-order the columns into sorted order
df_imp_features_new_accounts_day_X = df_imp_features_new_accounts_day_X.reindex(
    columns=sorted(df_imp_features_new_accounts_day_X.columns))

In [None]:
# Remove every row that has at least one NaN value
df_imp_features_new_accounts_day_X = df_imp_features_new_accounts_day_X.dropna(
    axis=0, how='any')

# 8. Filtering inactive users' accounts

In [None]:
# Columns that are left out of the per-row activity sum
ex_cols_list = ['admin_email','days_on_platform', 'effective_date', 'signup_date', 'id']
cols_list = list(df_imp_features_new_accounts_day_X)

# Every remaining column, kept in the frame's own column order. A plain set
# difference would return them in arbitrary order, which can differ per run.
excluded_cols = set(ex_cols_list)
cols = [col for col in cols_list if col not in excluded_cols]

In [None]:
# Function for aggregating selected column values
def cell_value_sum(row, cols):
    """Return the sum of ``row[c]`` over every key ``c`` in `cols`.

    row  -- any object indexable by the entries of `cols` (e.g. a DataFrame row)
    cols -- iterable of keys/column names to add up

    Returns 0 when `cols` is empty.
    """
    # Use the builtin instead of a local accumulator that shadowed its name
    return sum(row[col] for col in cols)


In [None]:
# Filter out inactive accounts: keep the rows whose selected columns (`cols`)
# add up to more than zero
has_activity = df_imp_features_new_accounts_day_X.apply(
    lambda row: cell_value_sum(row, cols) > 0, axis=1)
df_final_features_new_accounts_day_X = df_imp_features_new_accounts_day_X[has_activity]

# 9. Saving the filtered features data for new accounts

In [None]:
# Write the filtered feature data for new accounts to a tab-separated file,
# without the index column
today = date.today().isoformat()
path = "path"
df_final_features_new_accounts_day_X.to_csv(path_or_buf=path, sep="\t", index=False)