# Interface between PostgreSQL DB and Pandas

In [2]:
import os

In [3]:
data_dir = '/home/chan/workspace/datasets/speed-dating/'
data_file_path = os.path.join(data_dir, 'Speed Dating Data.csv')

In [6]:
import csv
csv_file = open(data_file_path, 'r', encoding='latin_1')
csv_reader = csv.DictReader(csv_file)

In [7]:
data_keys = {
    "iid": "INTEGER",
    "gender": "INTEGER",
    "round": "INTEGER",
    "attr": "FLOAT",
    "sinc": "FLOAT",
    "intel": "FLOAT",
    "fun": "FLOAT",
    "amb": "FLOAT",
    "dec": "FLOAT",
}

In [8]:
import psycopg2

In [15]:
ks = [k for k in data_keys.keys()]

with psycopg2.connect("dbname=speed-dating user=chan") as conn:
    with conn.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS data")
        cur.execute(
            "CREATE TABLE data (obs_id SERIAL PRIMARY KEY, {});"
            .format(
                ', '.join(
                    ['{} {}'.format(k, v) for k, v in data_keys.items()]
                )
            )
        )
        for an_obs in csv_reader:
            cur.execute(
                "INSERT INTO data ({}) VALUES ({});"
                .format(
                    ', '.join(['{}'.format(k) for k in ks]),
                    ', '.join(['%s'] * len(ks)),
                ),
                [an_obs[k] if an_obs[k] != '' else None for k in ks],
            )

        conn.commit()

In [16]:
import pandas as pd
conn = psycopg2.connect("dbname=speed-dating user=chan")
df = pd.read_sql(
    "SELECT {} FROM data;"
    .format(
        ', '.join(['{}'.format(k) for k in data_keys.keys()])
    ),
    conn
)

In [17]:
df.sample(10)

Unnamed: 0,obs_id,iid,gender,round,attr,sinc,intel,fun,amb,dec
2152,2150,167,1,16,3.0,7.0,6.0,4.0,5.0,0.0
7808,7809,533,1,22,6.0,8.0,8.0,7.0,7.0,0.0
1632,1630,125,1,10,8.0,8.0,7.0,7.0,7.0,1.0
3479,3479,254,0,21,5.0,6.0,6.0,3.0,3.0,0.0
6619,6620,450,1,11,8.0,10.0,10.0,7.0,9.0,0.0
7855,7856,535,1,22,,,,,,0.0
19,20,17,1,10,6.0,4.0,7.0,5.0,6.0,0.0
3791,3791,269,0,21,6.0,8.0,8.0,7.0,7.0,0.0
5963,5964,398,1,18,5.0,8.0,7.0,4.0,,0.0
6636,6637,452,1,11,6.0,8.0,7.0,6.0,6.0,1.0


In [19]:
df[
    pd.isna(df['attr'])
    | pd.isna(df['intel'])
    | pd.isna(df['amb'])
    | pd.isna(df['sinc'])
    | pd.isna(df['fun'])
].shape

(815, 10)

In [20]:
attributes = ['attr', 'intel', 'amb', 'sinc', 'fun']
df = df.dropna(axis='index', subset=attributes)

# Simple models

In [21]:
X_f = df[df['gender'] == 0][['iid', 'round'] + attributes]
y_f = df[df['gender'] == 0]['dec']

X_m = df[df['gender'] == 1][['iid', 'round'] + attributes]
y_m = df[df['gender'] == 1]['dec']

In [22]:
X_f['sample_weight'] = 1 / X_f['round']
X_m['sample_weight'] = 1 / X_m['round']

In [36]:
import numpy as np
import sklearn
from sklearn import model_selection
from sklearn import linear_model

## A simple linear regression

Here we perform a very simple regression to estimate the decision according to p.682 of http://faculty.chicagobooth.edu/emir.kamenica/documents/genderDifferences.pdf

### Female, gender == 0

In [26]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_f, y_f, test_size=0.25)

In [29]:
reg = linear_model.LinearRegression()

In [33]:
reg.fit(X_train[attributes], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
reg.coef_

array([ 0.07621334,  0.01851753, -0.01172092, -0.01213269,  0.05510978])

In [37]:
y_pred = np.heaviside(reg.predict(X_test[attributes]), 0.5)

In [38]:
sklearn.metrics.mean_squared_error(y_test, y_pred)

0.5639344262295082

### Male, gender == 1

In [43]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_m, y_m, test_size=0.25)

In [41]:
reg = linear_model.LinearRegression()
reg.fit(X_train[attributes], y_train)
reg.coef_

array([ 0.11908156, -0.00253862, -0.01297444, -0.02325008,  0.05950302])

In [42]:
y_pred = np.heaviside(reg.predict(X_test[attributes]), 0.5)

sklearn.metrics.mean_squared_error(y_test, y_pred)

0.48136315228966986

## Logistic regression

### Female

In [44]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_f, y_f, test_size=0.25)

## No sample weight

In [45]:
lrc = linear_model.LogisticRegression(class_weight='balanced')
lrc.fit(X_train[attributes], y_train)
lrc.coef_

array([[ 0.4376902 ,  0.12893254, -0.13773595, -0.08615458,  0.41068   ]])

In [46]:
y_pred = lrc.predict(X_test[attributes])
sklearn.metrics.mean_squared_error(y_test, y_pred)

0.28743169398907104

## With sample weight

In [48]:
lrc = linear_model.LogisticRegression(class_weight='balanced')
lrc.fit(X_train[attributes], y_train, sample_weight=X_train['sample_weight'])
lrc.coef_

array([[ 0.34195099,  0.00260929, -0.1658326 , -0.11679748,  0.36857687]])

In [49]:
y_pred = lrc.predict(X_test[attributes])
sklearn.metrics.mean_squared_error(y_test, y_pred)

0.3092896174863388

## Male

In [50]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_m, y_m, test_size=0.25)

## No sample weight

In [51]:
lrc = linear_model.LogisticRegression(class_weight='balanced')
lrc.fit(X_train[attributes], y_train)
lrc.coef_

array([[ 0.65983461, -0.06312565, -0.08619471, -0.14891546,  0.36892042]])

In [52]:
y_pred = lrc.predict(X_test[attributes])
sklearn.metrics.mean_squared_error(y_test, y_pred)

0.24813631522896698

## With sample weight

In [53]:
lrc = linear_model.LogisticRegression(class_weight='balanced')
lrc.fit(X_train[attributes], y_train, sample_weight=X_train['sample_weight'])
lrc.coef_

array([[ 0.57067604, -0.18369957, -0.09390644, -0.19735431,  0.32446958]])

In [54]:
y_pred = lrc.predict(X_test[attributes])
sklearn.metrics.mean_squared_error(y_test, y_pred)

0.268370607028754