In [45]:
import csv
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale, normalize
from sklearn.model_selection import GridSearchCV

In [46]:
male = pd.read_csv("MALE.csv", header=0, index_col=None)
female = pd.read_csv("FEMALE.csv", header=0, index_col=None)
mix = pd.read_csv("MIXED.csv", header=0, index_col=None)

In [47]:
def split_data(df):
    test_data = df.sample(n=100, random_state=0)
    df = df.drop(test_data.index)
    develop_data = df.sample(n=100, random_state=0)
    train_data = df.drop(develop_data.index)
    return train_data, develop_data, test_data

In [48]:
male_train, male_develop, male_test = split_data(male)
female_train, female_develop, female_test = split_data(female)
mix_train, mix_develop, mix_test = split_data(mix)


In [49]:
def model(X_train, y_train, X_test, y_test, X_target, y_target):
    #X_train, X_test, X_target = normalize(X_train), normalize(X_test), normalize(X_target)
    X_train, X_test, X_target = scale(X_train), scale(X_test), scale(X_target)

    lr = LinearRegression(n_jobs=-1, normalize=True)
    lr.fit(X_train, y_train)
    print("score of Linear Regression:", lr.score(X_test, y_test))
    y_pred = lr.predict(X_target).round(0).astype(int)
    print('Mean Squared Error for Linear Regression:', metrics.mean_squared_error(y_target, y_pred))  
    print("accuracy score for Linear Regression:", metrics.accuracy_score(y_target, y_pred))

    mlp = MLPClassifier(max_iter=100, solver='sgd', learning_rate='constant', activation='tanh',)
    parameter_space = {
        'hidden_layer_sizes': [(50,50,50), (50,100,50), (100, 200, 100), (50, 300, 100)],
        'alpha': [0.0005, 0.0025, 0.001, 0.05],
    }
    #MLP with GridSearch
    clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5)
    clf.fit(X_train, y_train)
    # Best paramete set
    print('Best parameters found:\n', clf.best_params_) 
    '''
    # All results
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    #mlp.fit(X_train, y_train)
    '''    
    print("score of MLP:", clf.score(X_test, y_test))
    y_pred = clf.predict(X_target).round(0).astype(int)
    print('Mean Squared Error for MLP:', metrics.mean_squared_error(y_target, y_pred))  
    print("accuracy score for MLP:", metrics.accuracy_score(y_target, y_pred))

In [50]:
'''
SRCONLY:
The SRCONLY baseline ignores the target data and
trains a single model, only on the source data.

source:male+mix target:female
'''
print("SRCONLY: target is female")
source_train = pd.concat([male_train, mix_train])
source_develop = pd.concat([male_develop, mix_develop])
target = female_test
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

model(X_train, y_train, X_test, y_test, X_target, y_target)



SRCONLY: target is female
score of Linear Regression: 0.08100271257476022
Mean Squared Error for Linear Regression: 166.31
accuracy score for Linear Regression: 0.03
Best parameters found:
 {'alpha': 0.05, 'hidden_layer_sizes': (50, 100, 50)}
score of MLP: 0.045
Mean Squared Error for MLP: 227.14
accuracy score for MLP: 0.03


In [51]:
'''
TGTONLY:
The TGTONLY baseline trains a single model only
on the target data.

source:female target:female
'''
print("TGTONLY: target is female")
source_train = female_test 
source_develop = female_develop
target = female_test
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

model(X_train, y_train, X_test, y_test, X_target, y_target)


TGTONLY: target is female
score of Linear Regression: -0.08887180005753792
Mean Squared Error for Linear Regression: 146.89
accuracy score for Linear Regression: 0.02
Best parameters found:
 {'alpha': 0.05, 'hidden_layer_sizes': (50, 50, 50)}
score of MLP: 0.01
Mean Squared Error for MLP: 208.22
accuracy score for MLP: 0.12


In [52]:
'''
ALL:
The ALL baseline simply trains a standard learning
algorithm on the union of the two datasets

source:male+female+mix target:female
'''
print("ALL: target is female")
source_train = pd.concat([male_train, female_test, mix_train])
source_develop = pd.concat([male_develop, female_develop, mix_develop])
target = female_test
X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

model(X_train, y_train, X_test, y_test, X_target, y_target)



ALL: target is female
score of Linear Regression: 0.06930068664928313
Mean Squared Error for Linear Regression: 166.1
accuracy score for Linear Regression: 0.03
Best parameters found:
 {'alpha': 0.0025, 'hidden_layer_sizes': (100, 200, 100)}
score of MLP: 0.023333333333333334
Mean Squared Error for MLP: 276.98
accuracy score for MLP: 0.03


In [55]:
'''
WEIGHTED:
For instance, if N = 10 x M, we may weight each example from the source domain by 0:1. 
The next baseline, WEIGHTED, is exactly this approach, with the weight chosen by cross-validation.

source:male+female+mix target:female
'''
print("WEIGHTED: target is female")
orginal_n_samples = len(pd.concat([male_train, female_test, mix_train]))
source_train = pd.concat([male_train, mix_train, female_test, female_test, female_test]).sample(n=orginal_n_samples, random_state=0)
source_develop = pd.concat([male_develop, female_develop, mix_develop])
target = female_test

X_train, y_train = source_train.drop(['Exam Score'], axis=1), source_train['Exam Score']
X_test, y_test = source_develop.drop(['Exam Score'], axis=1), source_develop['Exam Score']
X_target, y_target = target.drop(['Exam Score'], axis=1), target['Exam Score']

model(X_train, y_train, X_test, y_test, X_target, y_target)


WEIGHTED: target is female
score of Linear Regression: 0.07036395772482773
Mean Squared Error for Linear Regression: 165.71
accuracy score for Linear Regression: 0.03
Best parameters found:
 {'alpha': 0.0025, 'hidden_layer_sizes': (50, 300, 100)}
score of MLP: 0.03
Mean Squared Error for MLP: 248.9
accuracy score for MLP: 0.05


In [None]:
'''
PRED:
using the output of the source classifier as a feature in the target classifier. 
Specifically, we first train a SRCONLY model. Then we run the SRCONLY model 
on the target data (training, development and test). We use the predictions 
made by the SRCONLY model as additional features and
train a second model on the target data, augmented with this new feature.

source:male+female+mix target:female
'''
print("PRED: target is female")



In [54]:
'''
http://users.umiacs.umd.edu/~hal/docs/daume07easyadapt.odp.pdf

https://www.aclweb.org/anthology/N18-2076.pdf

https://slideplayer.com/slide/10770963/


http://mlreference.com/feedforward-neural-network-sklearn
'''

'\nhttp://users.umiacs.umd.edu/~hal/docs/daume07easyadapt.odp.pdf\n\nhttps://www.aclweb.org/anthology/N18-2076.pdf\n\nhttps://slideplayer.com/slide/10770963/\n\n\nhttp://mlreference.com/feedforward-neural-network-sklearn\n'