In [258]:
import sqlite3
from sqlite3 import Error

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [259]:
def load_otodom(data_dir="data"):
    """Load the four Otodom CSV files into DataFrames with explicit column names.

    Every file is read as semicolon-delimited. Because ``names=`` is passed
    without ``header=``, pandas treats the files as header-less, so the first
    line of each file is read as data.

    Parameters
    ----------
    data_dir : str or path-like, default "data"
        Directory containing ``data_ads.csv``, ``data_replies.csv``,
        ``data_segments.csv`` and ``data_categories.csv``.

    Returns
    -------
    list of pandas.DataFrame
        ``[data_ads_df, data_replies_df, data_segments_df, data_categories_df]``,
        always in this order.
    """
    # (file name, column names) listed in the order the frames are returned
    csv_layout = [
        # here you can find information about the announcements
        ("data_ads.csv", ["date", "user_id", "ad_id", "category_id", "params"]),
        # information about the response per advertisement per day
        ("data_replies.csv", ["date", "user_id", "ad_id", "mails", "phones"]),
        # segmentation mapping for each user
        ("data_segments.csv", ["user_id", "segment"]),
        # mapping to category tree
        ("data_categories.csv", ["category_id", "category_name"]),
    ]

    return [
        pd.read_csv(f"{data_dir}/{file_name}", delimiter=";", names=column_names)
        for file_name, column_names in csv_layout
    ]

In [260]:
def check_info(source):
    """Print the ``DataFrame.info()`` report for each DataFrame in ``source``.

    Parameters
    ----------
    source : iterable of pandas.DataFrame
        Frames whose column dtypes and non-null counts should be shown.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("Checking data types: \n")

    for df in source:
        # info() writes its report to stdout itself and returns None; wrapping
        # it in print() used to append a stray "None" line after every report.
        df.info()
        # blank line between consecutive reports
        print()

In [261]:
def select_split(source):
    """Split a DataFrame into train, test and validation features/targets.

    Columns are selected by position: the first four columns are the features
    and the fifth column is the target. Both splits use ``random_state=1``.

    Parameters
    ----------
    source : pandas.DataFrame
        Frame with at least five columns.

    Returns
    -------
    list
        ``[X_train, X_test, X_val, y_train, y_test, y_val]``
    """
    features = source.iloc[:, 0:4]
    target = source.iloc[:, 4]

    # First cut: hold out 20% of all rows as the test set.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, target, test_size=0.2, random_state=1
    )
    # Second cut: 25% of the remaining 80% (= 20% of all rows) becomes the
    # validation set, leaving 60% of all rows for training.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=1
    )

    return [X_train, X_test, X_val, y_train, y_test, y_val]

In [262]:
def check_missing(source, df_names):
    """Print the percentage of missing values for each object in ``source``.

    Parameters
    ----------
    source : iterable of pandas.DataFrame or pandas.Series
        Objects to inspect; each must support ``isnull()`` and ``len()``.
    df_names : iterable of str
        Labels used in the printed header, paired positionally with ``source``.

    Returns
    -------
    None
        The percentages (rounded to 2 decimals) are written to stdout.
    """
    for frame, label in zip(source, df_names):
        # share of null entries, expressed in percent
        missing_pct = frame.isnull().sum() / len(frame) * 100
        print(f"Missing in {label} %\n", round(missing_pct, 2), "\n")

In [263]:
def handle_missing(source):
    """Placeholder for missing-value handling; not implemented yet.

    ``source`` is accepted but currently ignored, and ``None`` is returned.
    """
    return None

# SQL

In [264]:
def sqlite3_connect(db_file):
    """Open a connection to a local SQLite database.

    Parameters
    ----------
    db_file : str or path-like
        Database location, passed unchanged to ``sqlite3.connect``.

    Returns
    -------
    sqlite3.Connection or None
        The open connection, or ``None`` when ``sqlite3.Error`` was raised
        (the error is printed, not re-raised, so callers must check for None).
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        # Report the SQLite library version. The previously used
        # sqlite3.version is deprecated since Python 3.12 and removed in 3.14,
        # where it would raise an uncaught AttributeError after connecting.
        print("Connected to sqlite3 ver: ", sqlite3.sqlite_version)
    except Error as e:
        print(e)

    return conn

In [265]:
def sqlite3_insert(source, tables, connection):
    """Write DataFrames from ``source`` into database tables.

    Frames are taken by position (``source[0]`` .. ``source[3]``) and paired
    with the names in ``tables``; at most four tables are written. An existing
    table of the same name is replaced and the DataFrame index is not stored.

    Parameters
    ----------
    source : sequence of pandas.DataFrame
        Expected order: [data_ads_df, data_replies_df, data_segments_df,
        data_categories_df].
    tables : iterable of str
        Target table names, in the same order as ``source``.
    connection
        Database connection handed to ``DataFrame.to_sql``.
    """
    for position, table_name in zip(range(4), tables):
        frame = source[position]
        frame.to_sql(table_name, connection, if_exists="replace", index=False)


In [266]:
def sqlite3_query(connection, query):
    """Execute a single SQL statement on ``connection`` and commit it.

    Parameters
    ----------
    connection : sqlite3.Connection
        Open database connection.
    query : str
        SQL statement to run (for example a ``CREATE TABLE`` or ``INSERT``).

    Returns
    -------
    None
        Any result rows are discarded. ``sqlite3.Error`` is caught and
        printed instead of being propagated.
    """
    try:
        # Using the connection as a context manager commits on success and
        # rolls back on failure; without it, data-modifying statements stayed
        # in an open transaction and were lost when the connection was closed.
        with connection:
            connection.execute(query)
        print("Query send!")
    except Error as e:
        print(e)

In [267]:
def sql_job(source):
    """Store the loaded DataFrames in the local ``otodom.db`` SQLite database.

    Opens the database via ``sqlite3_connect``, writes the frames to the
    tables ``ads``, ``replies``, ``segments`` and ``categories`` via
    ``sqlite3_insert`` and then closes the connection again.

    Parameters
    ----------
    source : sequence of pandas.DataFrame
        Frames in the order [data_ads_df, data_replies_df, data_segments_df,
        data_categories_df], matching the table names above.

    Returns
    -------
    None
        Progress and errors are reported on stdout.
    """
    database = r"otodom.db"
    table_list = ["ads", "replies", "segments", "categories"]

    conn = sqlite3_connect(database)

    if conn is None:
        print("Error! cannot create the database connection.")
        return

    try:
        sqlite3_insert(source=source, tables=table_list, connection=conn)
        # Additional ad-hoc statements can be run at this point with
        # sqlite3_query(connection=conn, query=...).
        print("SQL job is done.")
    finally:
        # Always release the database handle, even if the insert fails;
        # previously the connection was never closed.
        conn.close()

# LIQUIDITY

In [268]:
def liquidity_per_user():
    """Placeholder for the per-user liquidity calculation; not implemented yet.

    Liquidity is understood as the percentage of advertisements that received
    at least one response (by phone or e-mail) within a period of 7 days,
    including day 0 - the day the ad was added.
    """
    return None

In [269]:
def full_data_analysis():
    """Placeholder for the full data analysis; not implemented yet.

    Task notes:
    - Jupyter / R Markdown is preferred for the analysis.
    - Scripts may live in separate files or inside a notebook, depending on
      the selected methods.
    - Final results and the most important conclusions should be presented
      as a presentation (e.g. Google Slides).
    """
    return None

In [270]:
def question_1():
    """Placeholder for question 1; not implemented yet.

    Question: what differences are visible between the segments in terms of
    the available data (including liquidity)?
    """
    return None

In [271]:
def question_2():
    """Placeholder for question 2; not implemented yet.

    Question: what might influence higher or lower levels of liquidity?
    """
    return None

In [272]:
def main():
    """Run the pipeline: load data, inspect it, split replies, report gaps, fill the DB."""
    # All Otodom DataFrames, in the order
    # [data_ads_df, data_replies_df, data_segments_df, data_categories_df]
    frames = load_otodom()

    # Print dtype / non-null information for every frame (returns None)
    check_info(source=frames)

    # Only the replies frame (index 1) is split into features and target
    replies_df = frames[1]
    split_parts = select_split(source=replies_df)
    # split_parts: [X_train, X_test, X_val, y_train, y_test, y_val]

    # Labels for the split parts, in the same order as select_split returns them
    part_labels = ["X_train", "X_test", "X_val", "y_train", "y_test", "y_val"]

    # Print the percentage of missing values per split part (returns None)
    check_missing(source=split_parts, df_names=part_labels)

    # Classification is work in progress: handle_missing(source=replies_df)
    # is intentionally not called yet.

    # Create the database and insert the data from the DataFrames
    sql_job(source=frames)

In [273]:
# Run the whole pipeline when this cell (or the exported script) is executed directly.
if __name__ == '__main__':
    main()

Checking data types: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5546 entries, 0 to 5545
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         5546 non-null   object
 1   user_id      5546 non-null   int64 
 2   ad_id        5546 non-null   int64 
 3   category_id  5546 non-null   int64 
 4   params       5546 non-null   object
dtypes: int64(3), object(2)
memory usage: 216.8+ KB
None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753277 entries, 0 to 753276
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   date     753277 non-null  object 
 1   user_id  753277 non-null  int64  
 2   ad_id    753277 non-null  int64  
 3   mails    753277 non-null  int64  
 4   phones   643009 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 28.7+ MB
None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405 entries, 0 to 404
