In [1]:
import numpy as np
import pandas as pd
import glob
import xgboost
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
import matplotlib.pyplot as plt

In [2]:
# Load every per-subject CSV, smooth it with a 10-row rolling mean, and pool
# the smoothed rows into one DataFrame per gender (`mens`, `womens`).
mens = []
womens = []
labels = pd.read_csv('./gender_labels.csv')
# sorted(): glob.glob() yields matches in arbitrary order, and the order of
# concatenation decides which rows the seeded train_test_split selects later,
# so an unsorted listing makes the results machine-dependent.
for s in sorted(glob.glob('/neuro/notebooks/all_data_confounds_remove/*.csv')):
    # The subject id is the leading integer of the file name ("<id>_...csv").
    person = int(s.split('/')[-1].split('_')[0])
    data = pd.read_csv(s)
    # Rolling mean over 10 consecutive rows; rows containing NaN (at least the
    # first 9, whose window is incomplete) are dropped.
    data = data.rolling(window=10).mean().dropna()
    gender = labels.loc[labels['person'] == person, 'gender']
    if gender.empty:
        # Fail with a message that names the subject instead of the opaque
        # IndexError that `.values[0]` raises on an empty selection.
        raise ValueError(
            'no gender label for person {} (file {})'.format(person, s)
        )
    # NOTE(review): every label other than 'M' (including NaN or typos) is
    # pooled with the women -- confirm the label file only holds two values.
    if gender.values[0] == 'M':
        mens.append(data)
    else:
        womens.append(data)
# pd.concat raises ValueError if a list is empty (no files for that gender).
mens = pd.concat(mens)
womens = pd.concat(womens)

In [3]:
# Build a train/test split for one target region of the male cohort: the
# target is column 'x0', the predictors are all remaining columns.
data = mens
region = 'x0'
y = data[region]
X = data.drop(columns=[region])
feature_names = X.columns

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

# Standardise the predictors using statistics from the training part only.
# The scaler returns plain arrays, so they are wrapped back into DataFrames
# to keep the column names (the row index is reset to 0..n-1 in the process).
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=feature_names,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=feature_names,
)

In [4]:
# For every region column, fit an XGBoost regressor that predicts that region
# from all the other regions, and record one row of metrics per region:
# [train R2, test R2, train MAE, test MAE].
scores = []
for i in range(data.shape[1]):
    # Assumes the columns are named 'x0' ... 'x<n-1>'; any other naming raises
    # a KeyError on the drop below.
    region = 'x'+str(i)
    X = data.drop([region], axis=1)

    feature_names = X.columns
    y = data[region]
    # NOTE(review): train_test_split shuffles rows by default, while the rows
    # were produced by a 10-row rolling mean, so neighbouring rows share most
    # of their window. Train and test are therefore not independent and the
    # test scores are likely optimistic -- consider splitting by subject.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)
    model = xgboost.XGBRegressor(max_depth=4,
                      n_estimators=500,
                      # NOTE(review): `eta` is XGBoost's alias for
                      # `learning_rate`, yet two different values are passed;
                      # confirm which one is intended and drop the other.
                      eta=0.1,
                      learning_rate=0.08,
                      gamma=0,
                      reg_alpha=0.25,
                      reg_lambda=0.1,
                      subsample=0.95,
                      colsample_bytree=1,
                      random_state=0,
                      n_jobs=20,
                      objective='reg:squarederror',
                      tree_method='gpu_hist',
                      predictor='gpu_predictor',
                      n_gpus=1,
                      gpu_id=0
                          )

    model.fit(X_train, y_train)
    # Predict once per split and reuse the result for both metrics instead of
    # re-running inference for every score.
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    scores.append([r2_score(y_train, pred_train), r2_score(y_test, pred_test),
                   mean_absolute_error(y_train, pred_train),
                   mean_absolute_error(y_test, pred_test)])
# After the loop `model` is the regressor fitted for the last region only.

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


In [5]:
# Turn the per-region metric rows into a 2-D array so columns can be sliced.
scores = np.asarray(scores)

In [6]:
# Show the first two metric columns (train and test R2), rounded to 2 decimals.
scores[:, :2].round(2)

array([[0.72, 0.68],
       [0.81, 0.78],
       [0.69, 0.66],
       [0.79, 0.76],
       [0.69, 0.65],
       [0.77, 0.74],
       [0.88, 0.86],
       [0.77, 0.75],
       [0.72, 0.67],
       [0.78, 0.75],
       [0.77, 0.74],
       [0.78, 0.75],
       [0.66, 0.61],
       [0.6 , 0.54],
       [0.45, 0.39],
       [0.73, 0.69],
       [0.85, 0.84],
       [0.74, 0.71],
       [0.8 , 0.77],
       [0.82, 0.79],
       [0.81, 0.78],
       [0.76, 0.73],
       [0.76, 0.73],
       [0.83, 0.81],
       [0.59, 0.55],
       [0.76, 0.73],
       [0.42, 0.35],
       [0.72, 0.68],
       [0.72, 0.68],
       [0.76, 0.73],
       [0.79, 0.77],
       [0.8 , 0.77],
       [0.64, 0.59],
       [0.41, 0.34],
       [0.52, 0.47],
       [0.82, 0.8 ],
       [0.44, 0.38],
       [0.57, 0.52],
       [0.71, 0.69],
       [0.8 , 0.77],
       [0.74, 0.71],
       [0.84, 0.82],
       [0.77, 0.74],
       [0.7 , 0.67],
       [0.72, 0.69],
       [0.81, 0.78],
       [0.83, 0.81],
       [0.66,

In [None]:
# Plot the 50 most important features of `model` on a single 16x9 axes.
fig = plt.figure(figsize=(16, 9))
ax = fig.add_subplot(1, 1, 1)
xgboost.plot_importance(
    model,
    ax=ax,
    max_num_features=50,
)