In [355]:
import sqlite3
from sqlite3 import Error

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression

# ML

In [356]:
def load_otodom():
    """Load csv files with defined column names"""
    
    data_ads_cols = ["date", "user_id", "ad_id", "category_id", "params"]
    data_replies_cols = ["date", "user_id", "ad_id", "mails", "phones"]
    data_segmentation_cols = ["user_id", "segment"]
    data_categories_cols = ["category_id", "category_name"]

    # here you can find information about the announcements
    data_ads_df = pd.read_csv("data/data_ads.csv", delimiter=";", names=data_ads_cols)
    # information about the response per advertisement per day
    data_replies_df = pd.read_csv("data/data_replies.csv", delimiter=";", names=data_replies_cols)
    # segmentation mapping for each user
    data_segments_df = pd.read_csv("data/data_segments.csv", delimiter=";", names=data_segmentation_cols)
    # mapping to category tree
    data_categories_df = pd.read_csv("data/data_categories.csv", delimiter=";", names=data_categories_cols)
    
    return [data_ads_df, data_replies_df, data_segments_df, data_categories_df]

In [357]:
def check_info(source):
    """Check columns type for each DataFrame"""
    
    print("Checking info: \n")
    
    for df in source:
        print (df.info(), "\n")

In [358]:
def cut_missing(source, column):
    """Cut rows with missing values from original source and make new df with only null values"""
    
    replies = source
    null_array = np.array([]) # Im not sure if its works.
    null_indices =  replies[replies[column].isnull()].index.tolist()
    
    # cutting nulls
    for i in null_indices:
        np.append(null_array, replies.loc[i])
    
    # dropping nulls
    for i in null_indices:
        replies.drop(replies.index[i], inplace=True)
    
    not_null_replies = replies
    
    # new DataFrame with missing values
    null_replies = pd.DataFrame(null_array)
    
    # saving to csv
    null_replies.to_csv("data/null_replies.csv")
    not_null_replies.to_csv("data/not_null_replies.csv")
    
    return [null_replies, not_null_replies]

In [None]:
def load_replies():
    """Load csv files with null/not null replies"""
    
    #data_replies_cols = ["date", "user_id", "ad_id", "mails", "phones"]
    
    # segmentation mapping for each user
    null_replies = pd.read_csv("data/null_replies.csv"#, names=data_replies_cols)
    # mapping to category tree
    not_null_replies = pd.read_csv("data/not_null_replies.csv"#, names=data_replies_cols)
    
    return [null_replies, not_null_replies]

In [359]:
def select_split(source):
    """ """
    
    # features
    X = source.iloc[:,0:4]
    # target
    y = source.iloc[:,4]
    
    # split into train=0.8, test=0.2
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    
    return [X_train, X_test, y_train, y_test]

In [360]:
def check_missing(df_names, *args):
    """ """
    my_list = [*args]
    for df, names in zip(my_list, df_names):
        print(f"Missing in {names} %\n",round(df.isnull().sum()/len(df)*100, 2),"\n")

In [361]:
def best_linear_regression(*args):
    """ """
    # linear regression
    logistic = LogisticRegression(solver="liblinear")
    penalty = ["l1", "l2"]
    C = np.logspace(0, 4, 1000)
    hyperparameters = dict(C=C, penalty=penalty)
    
    # standard scaling
    scaler = StandardScaler().fit(X_train)
    X_train_std = scaler.transform(X_train)
    X_test_std = scaler.transform(X_test)
    
    # randomized search
    randomizedsearch = RandomizedSearchCV(
        logistic,
        hyperparameters,
        random_state=1,
        n_iter=1000,
        cv=5,
        verbose=0,
        n_jobs=-1
    )
    
    best_random_model = randomizedsearch.fit(X_train_std, y_train)
    preds = randomizedsearch.predict(X_test_std)
     
    return preds

In [362]:
def join_replies(source_1, source_2, pred):
    """ """
    
    # join
    replies_1 = source_1
    replies_2 = source_2["phones"] = pred
    data_replies = replies_1.append(replies_2, ignore_index=True)
    
    return data_replies

In [None]:
def ml_job():
    # List of OLX DataFrames
    data = load_otodom()
    # Returns: [data_ads_df, data_replies_df, data_segments_df, data_categories_df]
    
    # DataFrame: data_replies_df
    data_replies_df = data[1]
    
    # Names of splitted DataFrames
    split_names = ["X_train", "X_test", "X_val", "y_train", "y_test", "y_val"]
    
    # Check info
    check_info(source=data)
    # Returns: None
    
     # Try load files from csv or make new
    try:
        null_replies, not_null_replies = load_replies()
    except Exception as e:
           print("Error has occurred: ", e, "\n")
    else:
        # Split original source into two pieces [null/not null] ans save it to csv
        null_replies, not_null_replies = cut_missing(source=data_replies_df, column="phones")
        # Returns: [null_replies, not_null_replies]

    # Select features, target and split it
    X_train, X_test, y_train, y_test = select_split(source=not_null_replies)
    # Retruns:  [X_train, X_test, y_train, y_test]
    
    # Check percent of missing values 
    check_missing(source=splitted, df_names=split_names)
    # Returns: None
    
    # Classification WIP
    preds = best_linear_regression(X_train, X_test, y_train, y_test)
    # Returns: preds
    
    new_data_replies = join_replies(source_1=not_null_replies, source_2=null_replies, pred=preds)
    # Returns data_replies
    
    # Update list of DataFrames
    data.remove(data_replies_df)
    data.append(new_data_replies)
    
    return data

# SQL

In [363]:
def sqlite3_connect(db_file):
    """Establish connection with local database"""
    
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        print("Connected to sqlite3 ver: ", sqlite3.version)
    except Error as e:
        print(e)
        
    return conn

In [364]:
def sqlite3_insert(source, tables, connection):
    """Insert data to the database from DataFrames"""
    
    # [data_ads_df, data_replies_df, data_segments_df, data_categories_df]
    indices = [0, 1, 2, 3]
    
    for i, tab in zip(indices, tables):
        source[i].to_sql(tab, connection, if_exists='replace', index=False)


In [365]:
def sqlite3_query(connection, query):
    """Create a table from table_query statement"""
    
    try:
        connection.execute(query)
        print("Query send!")
    except Error as e:
        print(e)

In [366]:
def sql_job(source):
    """ """
    
    database = r"otodom.db"
    table_list = ["ads", "replies", "segments", "categories"]
    
    sql_my_query = " "
    
    conn = sqlite3_connect(database)
    
    if conn is not None:
        sqlite3_insert(source=source, tables=table_list, connection=conn)
        #sqlite3_query(connection=conn, query=sql_my_query)
        #sqlite3_query(connection=conn, query=sql_my_query)
        #sqlite3_query(connection=conn, query=sql_my_query)
        print("SQL job is done.")
    else:
        print("Error! cannot create the database connection.")

# LIQUIDITY

In [367]:
def liquidity_per_user():
    """
    Liquidity will be understood as % of advertisements which have received 
    at least 1 response (by phone or e-mail) within a period of 7 days 
    (including day 0 - the day of adding an day of adding an ad)
    """
    
    pass

In [368]:
def full_data_analysis():
    """ 
    Jupyter/R Markdown preferred for analysis

    Scripts can be in separate files, or as part of a notebook depending on
    selected methods

    Please present your final results and most important conclusions in the 
    form of a presentation (e.g. Google slides)
    """
    
    pass

In [369]:
def question_1():
    """ 
    What differences do you see between the segments in terms of the data 
    you have available (including liquidity)?
    """
    
    pass

In [370]:
def question_2():
    """What do you think might influence higher or lower levels of liquidity?"""
    
    pass

# MAIN

In [371]:
def main():
    
    # Predict missing values with LinearRegression
    data = ml_job()
    
    # Make db, insert data from df
    sql_job(source=data)

In [372]:
if __name__ == '__main__':
    %time main()

IndexError: index 657032 is out of bounds for axis 0 with size 657029