In [None]:
import csv
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale, normalize
from sklearn.model_selection import GridSearchCV

In [42]:
male = pd.read_csv("MALE.csv", header=0, index_col=None)
female = pd.read_csv("FEMALE.csv", header=0, index_col=None)
mix = pd.read_csv("MIXED.csv", header=0, index_col=None)

In [43]:
def split_data(df):
    test_data = df.sample(n=100, random_state=0)
    df = df.drop(test_data.index)
    develop_data = df.sample(n=100, random_state=0)
    train_data = df.drop(develop_data.index)
    return train_data, develop_data, test_data

In [44]:
male_train, male_develop, male_test = split_data(male)
female_train, female_develop, female_test = split_data(female)
mix_train, mix_develop, mix_test = split_data(mix)


In [45]:
def model(X_train, y_train, X_test, y_test, X_target, y_target):
    #X_train, X_test, X_target = normalize(X_train), normalize(X_test), normalize(X_target)
    #X_train, X_test, X_target = scale(X_train), scale(X_test), scale(X_target)

    lr = LinearRegression(n_jobs=-1, normalize=True)
    lr.fit(X_train, y_train)
    print("score of Linear Regression:", lr.score(X_test, y_test))
    y_pred = lr.predict(X_target).round(0).astype(int)
    print('Mean Squared Error for Linear Regression:', metrics.mean_squared_error(y_target, y_pred))  
    print("accuracy score for Linear Regression:", metrics.accuracy_score(y_target, y_pred))

    mlp = MLPClassifier(max_iter=100, solver='sgd', learning_rate='constant', activation='tanh',)
    parameter_space = {
        'hidden_layer_sizes': [(50,50,50), (50,100,50), (100, 200, 100), (50, 300, 100)],
        'alpha': [0.0005, 0.0025, 0.001, 0.05],
    }
    #MLP with GridSearch
    clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5)
    clf.fit(X_train, y_train)
    # Best paramete set
    print('Best parameters found:\n', clf.best_params_) 
    '''
    # All results
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    #mlp.fit(X_train, y_train)
    '''    
    print("score of MLP:", clf.score(X_test, y_test))
    y_pred = clf.predict(X_target).round(0).astype(int)
    print('Mean Squared Error for MLP:', metrics.mean_squared_error(y_target, y_pred))  
    print("accuracy score for MLP:", metrics.accuracy_score(y_target, y_pred))

In [46]:
'''
SRCONLY:
The SRCONLY baseline ignores the target data and
trains a single model, only on the source data.

source: male+mix
target: female
'''
print("SRCONLY: target is female")
source_train = pd.concat([male_train, mix_train])
source_develop = pd.concat([male_develop, mix_develop])
target = female_test
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

model(X_train, y_train, X_test, y_test, X_target, y_target)



SRCONLY: target is female
score of Linear Regression: 0.07989055357717878
Mean Squared Error for Linear Regression: 161.03
accuracy score for Linear Regression: 0.05
Best parameters found:
 {'alpha': 0.05, 'hidden_layer_sizes': (100, 200, 100)}
score of MLP: 0.055
Mean Squared Error for MLP: 242.81
accuracy score for MLP: 0.01


In [47]:
'''
TGTONLY:
The TGTONLY baseline trains a single model only
on the target data.

source: female
target: female
'''
print("TGTONLY: target is female")
source_train = female_test 
source_develop = female_develop
target = female_test
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

model(X_train, y_train, X_test, y_test, X_target, y_target)


TGTONLY: target is female
score of Linear Regression: -0.11374860922435559
Mean Squared Error for Linear Regression: 146.89
accuracy score for Linear Regression: 0.02
Best parameters found:
 {'alpha': 0.0005, 'hidden_layer_sizes': (100, 200, 100)}
score of MLP: 0.02
Mean Squared Error for MLP: 161.35
accuracy score for MLP: 0.14


In [48]:
'''
ALL:
The ALL baseline simply trains a standard learning
algorithm on the union of the two datasets

source: male+female+mix
target: female
'''
print("ALL: target is female")
source_train = pd.concat([male_train, female_test, mix_train])
source_develop = pd.concat([male_develop, female_develop, mix_develop])
target = female_test
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target,y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']
 
model(X_train, y_train, X_test, y_test, X_target, y_target)



ALL: target is female
score of Linear Regression: 0.07280278281780084
Mean Squared Error for Linear Regression: 160.57
accuracy score for Linear Regression: 0.05


KeyboardInterrupt: 

In [9]:
'''
WEIGHTED:
For instance, if N = 10 x M, we may weight each example from the source domain by 0:1. 
The next baseline, WEIGHTED, is exactly this approach, with the weight chosen by cross-validation.

source: male+female+mix
target: female
'''
print("WEIGHTED: target is female")
orginal_n_samples = len(pd.concat([male_train, female_test, mix_train]))
source_train = pd.concat([male_train, mix_train, female_test, female_test, female_test]).sample(n=orginal_n_samples, random_state=0)
source_develop = pd.concat([male_develop, female_develop, mix_develop])
target = female_test

X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

model(X_train, y_train, X_test, y_test, X_target, y_target)


WEIGHTED: target is female
score of Linear Regression: 0.07036395772482773
Mean Squared Error for Linear Regression: 165.71
accuracy score for Linear Regression: 0.03
Best parameters found:
 {'alpha': 0.05, 'hidden_layer_sizes': (100, 200, 100)}
score of MLP: 0.02
Mean Squared Error for MLP: 233.58
accuracy score for MLP: 0.04


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True),
 GridSearchCV(cv=5, error_score=nan,
              estimator=MLPClassifier(activation='tanh', alpha=0.0001,
                                      batch_size='auto', beta_1=0.9,
                                      beta_2=0.999, early_stopping=False,
                                      epsilon=1e-08, hidden_layer_sizes=(100,),
                                      learning_rate='constant',
                                      learning_rate_init=0.001, max_fun=15000,
                                      max_iter=100, momentum=0.9,
                                      n_iter_no_change=10,
                                      nesterovs_momentum=True, power_t=0.5,
                                      random_state=None, shuffle=True,
                                      solver='sgd', tol=0.0001,
                                      validation_fraction=0.1, verbose=False,
                                 

In [49]:
'''
PRED:
using the output of the source classifier as a feature in the target classifier. 
Specifically, we first train a SRCONLY model. Then we run the SRCONLY model 
on the target data (training, development and test). We use the predictions 
made by the SRCONLY model as additional features and
train a second model on the target data, augmented with this new feature.

source: prdictions in SRCONLY + target data
target: female
'''
print("PRED: prdictions in SRCONLY Linear Regression")
source_train = pd.concat([male_train, mix_train])
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
#X_train, X_test, X_target = scale(X_train), scale(X_test), scale(X_target)
src_lr = LinearRegression(n_jobs=-1, normalize=True)
src_lr.fit(X_train, y_train)

#linear regression
print("PRED: target is female Linear Regression")
source_train = female_train
source_develop = female_develop
target = female_test

X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

X_train['srconly_feature'] = src_lr.predict(X_train).round(0).astype(int)
X_test['srconly_feature'] = src_lr.predict(X_test).round(0).astype(int)
X_target['srconly_feature'] = src_lr.predict(X_target).round(0).astype(int)

#X_train, X_test, X_target = scale(X_train), scale(X_test), scale(X_target)
lr = LinearRegression(n_jobs=-1, normalize=True)
lr.fit(X_train, y_train)
print("score of Linear Regression:", lr.score(X_test, y_test))
y_pred = lr.predict(X_target).round(0).astype(int)
print('Mean Squared Error for Linear Regression:', metrics.mean_squared_error(y_target, y_pred))  
print("accuracy score for Linear Regression:", metrics.accuracy_score(y_target, y_pred))

#mlp
print("PRED: prdictions in SRCONLY MLP")
source_train = pd.concat([male_train, mix_train])
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
mlp = MLPClassifier(max_iter=100, solver='sgd', learning_rate='constant', activation='tanh',)
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100, 200, 100), (50, 300, 100)],
    'alpha': [0.0005, 0.0025, 0.001, 0.05],
}
scr_clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5)
scr_clf.fit(X_train, y_train)

print("PRED: target is female MLP Classifier")
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

X_train['srconly_feature'] = scr_clf.predict(X_train).round(0).astype(int)
X_test['srconly_feature'] = scr_clf.predict(X_test).round(0).astype(int)
X_target['srconly_feature'] = scr_clf.predict(X_target).round(0).astype(int)

#X_train, X_test, X_target = scale(X_train), scale(X_test), scale(X_target)
mlp = MLPClassifier(max_iter=100, solver='sgd', learning_rate='constant', activation='tanh',)
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100, 200, 100), (50, 300, 100)],
    'alpha': [0.0005, 0.0025, 0.001, 0.05],
}
#MLP with GridSearch
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)
# Best paramete set
print('Best parameters found:\n', clf.best_params_)  
print("score of MLP:", clf.score(X_test, y_test))
y_pred = clf.predict(X_target).round(0).astype(int)
print('Mean Squared Error for MLP:', metrics.mean_squared_error(y_target, y_pred))  
print("accuracy score for MLP:", metrics.accuracy_score(y_target, y_pred))

PRED: target is female Linear Regression
score of Linear Regression: 0.0293595878923838
Mean Squared Error for Linear Regression: 158.21
accuracy score for Linear Regression: 0.01
PRED: target is female MLP Classifier


KeyboardInterrupt: 

In [40]:
"""
LININT:
linearly interpolate the predictions of the SRCONLY 
and the TGTONLY models. The interpolation parameter is
adjusted based on target development data.

source: prdictions in SRCONLY + prdictions in TGTONLY
target: female
"""
target = female_test
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

print("LININT: prdictions in SRCONLY Linear Regression")
source_train = pd.concat([male_train, mix_train])
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
#X_train, X_test, X_target = scale(X_train), scale(X_test), scale(X_target)
src_lr = LinearRegression(n_jobs=-1, normalize=True)
src_lr.fit(X_train, y_train)

print("LININT: prdictions in TGTONLY Linear Regression")
source_train = female_test
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
#X_train, X_test, X_target = scale(X_train), scale(X_test), scale(X_target)
tgt_lr = LinearRegression(n_jobs=-1, normalize=True)
tgt_lr.fit(X_train, y_train)

'''linear regression'''
srconly_lr_y_pred = src_lr.predict(X_target).round(0).astype(int)
tgtonly_lr_y_pred = tgt_lr.predict(X_target).round(0).astype(int)

df = pd.DataFrame()
df["srconly"] = srconly_lr_y_pred
df["tgtonly"] = tgtonly_lr_y_pred

w , mse = 0, min(metrics.mean_squared_error(y_target, srconly_lr_y_pred), metrics.mean_squared_error(y_target, tgtonly_lr_y_pred)) 
#tuning the weight hyperparameter
for i in range (1, 100):
    new_w = i/100
    new_linint = df["srconly"] + new_w * (df["tgtonly"] - df["srconly"]) 
    #new_linint = df["srconly"] * new_w + df["tgtonly"] * (1 - new_w) # method 2 https://arxiv.org/pdf/1312.6204.pdf
    new_mse = metrics.mean_squared_error(y_target, new_linint)
    if new_mse < mse :
        w , mse = new_w, new_mse

print("weight is equal to", w)
df['pred'] = df["srconly"] + w * (df["tgtonly"] - df["srconly"]) 
#linint = df["srconly"] * w + df["tgtonly"] * (1 - w)
print('Mean Squared Error for linear regression:', metrics.mean_squared_error(y_target, df['pred'].round(0).astype(int)))
print("accuracy score for linear regression:", metrics.accuracy_score(y_target, df['pred'].round(0).astype(int)))

'''MLP'''
print("LININT: prdictions in SRCONLY MLP")
source_train = pd.concat([male_train, mix_train])
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
mlp = MLPClassifier(max_iter=100, solver='sgd', learning_rate='constant', activation='tanh',)
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100, 200, 100), (50, 300, 100)],
    'alpha': [0.0005, 0.0025, 0.001, 0.05],
}
scr_clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5)
scr_clf.fit(X_train, y_train)

print("LININT: prdictions in TGTONLY MLP")
source_train = female_test
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
mlp = MLPClassifier(max_iter=100, solver='sgd', learning_rate='constant', activation='tanh',)
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100, 200, 100), (50, 300, 100)],
    'alpha': [0.0005, 0.0025, 0.001, 0.05],
}
tgt_clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5)
tgt_clf.fit(X_train, y_train)

srconly_mlp_y_pred = scr_clf.predict(X_target).round(0).astype(int)
tgtonly_mlp_y_pred = tgt_clf.predict(X_target).round(0).astype(int)

df = pd.DataFrame()
df["srconly"] = srconly_mlp_y_pred
df["tgtonly"] = tgtonly_mlp_y_pred

w , mse = 0, min(metrics.mean_squared_error(y_target, srconly_mlp_y_pred), metrics.mean_squared_error(y_target, tgtonly_mlp_y_pred)) 
#tuning the weight hyperparameter
for i in range (1, 100):
    new_w = i/100
    new_linint = df["srconly"] + new_w * (df["tgtonly"] - df["srconly"]) 
    #new_linint = df["srconly"] * new_w + df["tgtonly"] * (1 - new_w) # method 2 https://arxiv.org/pdf/1312.6204.pdf
    new_mse = metrics.mean_squared_error(y_target, new_linint)
    if new_mse < mse :
        w , mse = new_w, new_mse

print("weight is equal to", w)
df['pred'] = df["srconly"] + w * (df["tgtonly"] - df["srconly"]) 
#linint = df["srconly"] * w + df["tgtonly"] * (1 - w)
print('Mean Squared Error for MLP:', metrics.mean_squared_error(y_target, df['pred'].round(0).astype(int)))
print("accuracy score for MLP:", metrics.accuracy_score(y_target, df['pred'].round(0).astype(int)))

LININT: prdictions in SRCONLY Linear Regression
LININT: prdictions in TGTONLY Linear Regression
weight is equal to 0.98
Mean Squared Error for linear regression: 146.89
accuracy score for linear regression: 0.02
weight is equal to 0
Mean Squared Error for MLP: 411.12
accuracy score for MLP: 0.02


In [12]:
'''
http://users.umiacs.umd.edu/~hal/docs/daume07easyadapt.odp.pdf

https://www.aclweb.org/anthology/N18-2076.pdf

https://slideplayer.com/slide/10770963/


http://mlreference.com/feedforward-neural-network-sklearn
'''

'\nhttp://users.umiacs.umd.edu/~hal/docs/daume07easyadapt.odp.pdf\n\nhttps://www.aclweb.org/anthology/N18-2076.pdf\n\nhttps://slideplayer.com/slide/10770963/\n\n\nhttp://mlreference.com/feedforward-neural-network-sklearn\n'