In [19]:
import sqlite3
from sqlite3 import Error

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [20]:
def load_otodom(data_dir="data"):
    """Load the Otodom CSV files into DataFrames with explicit column names.

    Every file is read as a semicolon-delimited CSV; the column names listed
    below are passed via ``names=`` rather than taken from the file.

    Parameters
    ----------
    data_dir : str or path-like, default "data"
        Directory that contains ``data_ads.csv``, ``data_replies.csv``,
        ``data_segments.csv`` and ``data_categories.csv``.

    Returns
    -------
    list of pandas.DataFrame
        ``[data_ads_df, data_replies_df, data_segments_df, data_categories_df]``
        in exactly that order.
    """

    # (file name, column names) pairs; the order here defines the order of the result.
    file_schemas = (
        # here you can find information about the announcements
        ("data_ads.csv", ["date", "user_id", "ad_id", "category_id", "params"]),
        # information about the response per advertisement per day
        ("data_replies.csv", ["date", "user_id", "ad_id", "mails", "phones"]),
        # segmentation mapping for each user
        ("data_segments.csv", ["user_id", "segment"]),
        # mapping to category tree
        ("data_categories.csv", ["category_id", "category_name"]),
    )

    return [
        pd.read_csv("{}/{}".format(data_dir, file_name), delimiter=";", names=column_names)
        for file_name, column_names in file_schemas
    ]

In [21]:
def check_dtypes(source):
    """Print the column dtypes of every DataFrame in ``source``.

    A header line is printed first; each frame's ``dtypes`` Series follows,
    separated from the next one by a blank line. Nothing is returned.
    """

    header = "Checking data types: \n"
    print(header)

    for frame in source:
        column_types = frame.dtypes
        print(column_types, "\n")

In [22]:
def check_null(source):
    """Print the percentage of null values per column for every DataFrame.

    For each frame the null counts are sorted in descending order, converted
    to a percentage of the frame's row count and rounded to two decimals.
    Nothing is returned.
    """

    header = "Checking nulls in data: \n"
    print(header)

    for frame in source:
        # Sort the raw counts first, then scale them to a percentage of rows.
        null_counts = frame.isnull().sum().sort_values(ascending=False)
        null_percent = null_counts / len(frame) * 100
        print(round(null_percent, 2), "\n")

# SQL

In [23]:
def sqlite3_connect(db_file):
    """Open a connection to a local SQLite database file.

    Parameters
    ----------
    db_file : str
        Path of the database file handed to ``sqlite3.connect``.

    Returns
    -------
    sqlite3.Connection or None
        The open connection, or ``None`` when connecting raised
        ``sqlite3.Error`` (the error is printed, not re-raised).
    """

    conn = None
    try:
        conn = sqlite3.connect(db_file)
        # Report the SQLite library version. ``sqlite3.version`` (the legacy
        # pysqlite module version) is deprecated since Python 3.12 and removed
        # in 3.14, where it would raise an uncaught AttributeError here.
        print(sqlite3.sqlite_version)
    except Error as e:
        print(e)

    return conn

In [24]:
def sqlite3_insert(source, tables, connection):
    """Write each DataFrame in ``source`` to the table named at the same position in ``tables``.

    Existing tables with the same name are replaced and the DataFrame index
    is not written.

    Parameters
    ----------
    source : iterable of pandas.DataFrame
        Frames to store, e.g.
        ``[data_ads_df, data_replies_df, data_segments_df, data_categories_df]``.
    tables : iterable of str
        Target table names, one per frame, in the same order as ``source``.
    connection
        Database connection passed straight to ``DataFrame.to_sql``.

    Raises
    ------
    ValueError
        If ``source`` and ``tables`` differ in length. The check runs before
        any table is written, so a mismatch never leaves a partial load.
    """

    frames = list(source)
    table_names = list(tables)

    if len(frames) != len(table_names):
        raise ValueError(
            "source and tables must have the same length "
            "(got {} frames and {} table names)".format(len(frames), len(table_names))
        )

    # Pair frames with names directly instead of going through a fixed index
    # list, so any number of frames can be stored.
    for frame, table_name in zip(frames, table_names):
        frame.to_sql(table_name, connection, if_exists='replace', index=False)


In [25]:
def sqlite3_query(connection, query):
    """Execute a single SQL statement on ``connection``.

    The statement runs inside the connection's context manager, so it is
    committed when it succeeds and rolled back when it fails. Without that,
    data-modifying statements would be left in an open transaction and lost
    once the connection is closed.

    Parameters
    ----------
    connection : sqlite3.Connection
        Open database connection.
    query : str
        SQL statement to execute.

    Returns
    -------
    None
        Any result rows are discarded. A ``sqlite3.Error`` is printed rather
        than raised.
    """

    try:
        # ``with connection`` commits on success and rolls back on exception;
        # it does not close the connection.
        with connection:
            connection.execute(query)
    except Error as e:
        print(e)

In [26]:
def sql_job(source):
    """Store the loaded DataFrames in the local ``otodom.db`` SQLite database.

    The frames in ``source`` are written, in order, to the tables ``ads``,
    ``replies``, ``segments`` and ``categories``. If the connection cannot be
    established an error message is printed and nothing is written.

    Parameters
    ----------
    source : list of pandas.DataFrame
        Frames in the order ``[ads, replies, segments, categories]``.
    """

    database = r"otodom.db"
    table_list = ["ads", "replies", "segments", "categories"]

    conn = sqlite3_connect(database)

    # sqlite3_connect returns None when the connection could not be opened.
    if conn is None:
        print("Error! cannot create the database connection.")
        return

    try:
        sqlite3_insert(source=source, tables=table_list, connection=conn)
        # Ad-hoc statements can be run at this point with
        # sqlite3_query(connection=conn, query=...).
    finally:
        # Always release the database handle, even if the insert fails.
        conn.close()
    

# LIQUIDITY

In [27]:
def liquidity_per_user():
    """Placeholder for the per-user liquidity computation (not implemented yet).

    Liquidity is understood as the percentage of advertisements that received
    at least one response (by phone or e-mail) within a period of 7 days,
    including day 0 - the day the ad was added.

    Currently does nothing and returns ``None``.
    """

In [28]:
def full_data_analysis():
    """Placeholder for the full data analysis (not implemented yet).

    Task notes:

    - Jupyter / R Markdown is preferred for the analysis.
    - Scripts can live in separate files or be part of a notebook, depending
      on the selected methods.
    - Final results and the most important conclusions should be presented
      in the form of a presentation (e.g. Google Slides).

    Currently does nothing and returns ``None``.
    """

In [29]:
def question_1():
    """Placeholder for question 1 (not implemented yet).

    Question: what differences can be seen between the segments in terms of
    the available data (including liquidity)?

    Currently does nothing and returns ``None``.
    """

In [30]:
def question_2():
    """Placeholder for question 2 (not implemented yet).

    Question: what might influence higher or lower levels of liquidity?

    Currently does nothing and returns ``None``.
    """

In [31]:
def main():
    """Load the Otodom CSV data, print dtype and null summaries, then store it in SQLite."""
    frames = load_otodom()

    # Each step takes the list of frames through the same ``source`` keyword.
    for step in (check_dtypes, check_null, sql_job):
        step(source=frames)

In [32]:
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Checking data types: 

date           object
user_id         int64
ad_id           int64
category_id     int64
params         object
dtype: object 

date        object
user_id      int64
ad_id        int64
mails        int64
phones     float64
dtype: object 

user_id     int64
segment    object
dtype: object 

category_id       int64
category_name    object
dtype: object 

Checking nulls in data: 

date           0.0
user_id        0.0
ad_id          0.0
category_id    0.0
params         0.0
dtype: float64 

phones     14.64
date        0.00
user_id     0.00
ad_id       0.00
mails       0.00
dtype: float64 

user_id    0.0
segment    0.0
dtype: float64 

category_id      0.0
category_name    0.0
dtype: float64 

2.6.0
