<a href="https://colab.research.google.com/github/dea1013/CS675-Project/blob/main/CS675_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing

## Import modules

In [None]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor

## Import data

#### Import KPMI data

In [None]:
# Load the KPMI questionnaire responses into the working DataFrame.
# NOTE(review): the path points into a mounted Google Drive folder; presumably
# Drive must be mounted before this cell runs — confirm.
df = pd.read_csv(
    "/content/drive/MyDrive/Spring 2022/"
    "Machine Learning/Project/data/kpmi_data.csv"
)

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Import Questions JSON

In [None]:
# Load the questionnaire schema (question texts and answer options).
# The schema carries text in several languages (entries are tagged with a
# 'lang' key), so decode explicitly as UTF-8 instead of relying on the
# platform's default encoding.
with open('/content/drive/MyDrive/Spring 2022/Machine Learning/'
          'Project/data/questionnaire_schema.json',
          encoding='utf-8') as json_file:
  raw_qas = json.load(json_file)

## Variables/Functions for Later Use

#### Create Question/Answer Dictionary

In [None]:
# Build a lookup keyed by "q<position>" that gives the English question text
# ('q') and the list of English answer options ('a') for each question.

qas = {}
for entry in raw_qas:
  english_prompts = [text for text in entry['text'] if text['lang'] == 'en']
  # Each answer holds a list of per-language texts; flatten them and keep
  # only the English values, preserving answer order.
  english_answers = [
      text['value']
      for answer in entry['answers']
      for text in answer['text']
      if text['lang'] == 'en'
  ]
  qas['q' + str(entry['position'])] = {'q': english_prompts[0]['value'],
                                       'a': english_answers}
qas['q1']

{'a': ['communicative', 'quite restrained and calm'], 'q': 'Usually you:'}

#### Create new columns/features

In [None]:
# create list of column names
q_cols = [col for col in df.columns if col[0] == "q"]
q_cols[:5]

['q1', 'q2', 'q3', 'q4', 'q5']

In [None]:
# One 0/1 target column per dichotomy: 1 when the psychotype letter at that
# position equals the first letter of the pair name (E, S, T, J), otherwise 0.
bin_cols = ["E/I","S/N","T/F","J/P"]
for position, pair in enumerate(bin_cols):
  first_letter = pair[0]
  letter_at_position = df["psychotype"].str[position]
  df[pair] = (letter_at_position == first_letter).astype(int)
df[bin_cols+["psychotype"]].head()

Unnamed: 0,E/I,S/N,T/F,J/P,psychotype
0,0,1,1,1,ISTJ
1,0,1,1,1,ISTJ
2,1,0,1,1,ENTJ
3,1,1,1,1,ESTJ
4,1,0,0,1,ENFJ


In [None]:
# Per-dichotomy weight: absolute gap between the two opposing scale scores,
# min-max rescaled to [0, 1].
# Relies on the scale columns appearing in df as consecutive opposing pairs
# (scale_cols[0]/[1] for E/I, [2]/[3] for S/N, ...).
scale_cols = [col for col in df.columns if "scale" in col]
weight_cols = [col+"_weight" for col in bin_cols]
for pair_ix, weight_col in enumerate(weight_cols):
  first_scale = df[scale_cols[2*pair_ix]]
  second_scale = df[scale_cols[2*pair_ix + 1]]
  gap = (first_scale - second_scale).abs()
  df[weight_col] = (gap - gap.min())/(gap.max() - gap.min())
df[weight_cols].head()

Unnamed: 0,E/I_weight,S/N_weight,T/F_weight,J/P_weight
0,0.6,0.361111,0.457143,0.485714
1,0.171429,0.027778,0.342857,0.2
2,0.628571,0.111111,0.114286,0.142857
3,0.514286,0.333333,0.057143,0.285714
4,0.4,0.027778,0.285714,0.8


#### Create functions

In [None]:
def to_type(x):
  """Turn four truthy/falsy flags into a four-letter type code.

  x[0]..x[3] choose within E/I, S/N, T/F and J/P respectively; a truthy flag
  selects the first letter of the pair, a falsy one the second.
  """
  letter_pairs = ("EI", "SN", "TF", "JP")
  return "".join(pair[0] if x[k] else pair[1]
                 for k, pair in enumerate(letter_pairs))

In [None]:
def eval(pred,actual):
  """Score exact-match accuracy for multi-label predictions.

  A row counts as correct only when every label in that row matches.

  Note: this name shadows Python's builtin ``eval`` for the rest of the
  notebook; it is kept because other cells call it by this name.

  Args:
    pred: 2-D array-like of predicted labels, one row per sample.
    actual: 2-D array-like (ndarray, DataFrame, nested list) of true labels
      with the same shape as ``pred``; rows are compared positionally.

  Returns:
    dict with keys "Correct", "Incorrect", "Total" and "Accuracy".
    "Accuracy" is NaN when there are no rows to score.

  Raises:
    ValueError: if ``pred`` and ``actual`` have different shapes.
  """
  pred = np.asarray(pred)
  actual = np.asarray(actual)
  if pred.shape != actual.shape:
    raise ValueError(
        f"pred and actual must have the same shape, got {pred.shape} and {actual.shape}")
  total = len(pred)
  if total == 0:
    # Nothing to score: avoid dividing by zero.
    correct = 0
    accuracy = float("nan")
  else:
    correct = int(np.sum(np.all(pred == actual,axis=1)))
    accuracy = correct/total
  return {"Correct": correct,
          "Incorrect": total - correct,
          "Total": total,
          "Accuracy": accuracy}

In [None]:
def predict(test,models):
  """Run every model on ``test`` and return the predictions side by side.

  The result has one row per sample and one column per model, in the order
  the models were given.
  """
  per_model = [model.predict(test) for model in models]
  return np.transpose(np.array(per_model))

In [None]:
def std(x):
  """Min-max rescale ``x`` so its smallest value maps to 0 and its largest to 1.

  Despite the name this is not a standard deviation. ``x`` must provide
  ``.min()`` and ``.max()`` (e.g. a numpy array or pandas Series).
  """
  lowest = x.min()
  span = x.max() - lowest
  return (x - lowest)/span

In [None]:
def print_qs(qnums, qa_lookup=None):
  """Print each requested question followed by its numbered answer options.

  Args:
    qnums: iterable of question ids (keys such as 'q5').
    qa_lookup: optional mapping from question id to a dict with 'q' (question
      text) and 'a' (list of answer texts). Defaults to the notebook-level
      ``qas`` dictionary.

  All answer options are listed, numbered from 0, so questions with fewer or
  more than two options are printed without errors or omissions.
  """
  lookup = qas if qa_lookup is None else qa_lookup
  for qnum in qnums:
    qa = lookup[qnum]
    print(f"{qnum}. {qa['q']}")
    for option_ix, answer in enumerate(qa['a']):
      print(f" ({option_ix}) {answer}")
print_qs(['q5'])

q5. Being in a company you usually:
 (0) join the conversation
 (1) talk with each separately


# Task 1: Learning the Questionnaire Perfectly

## Test-Train Split

In [None]:
# Hold out 30% of the rows for testing, stratified on the four binary targets
# and seeded so the split is reproducible.
X, y = df[q_cols], df[bin_cols]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1, test_size=0.3)

## Training

In [None]:
def train_lrs(X_train,y_train):
  """Fit one logistic-regression classifier per target column in ``bin_cols``.

  Args:
    X_train: feature matrix used to fit every classifier.
    y_train: frame holding one column per entry of the notebook-level
      ``bin_cols`` list.

  Returns:
    List of fitted LogisticRegression models, in ``bin_cols`` order.
  """
  def fit_one(target):
    # Same fixed hyper-parameters for every target.
    clf = LogisticRegression(solver='lbfgs',
                             random_state=1,
                             max_iter=500,
                             C=100)
    clf.fit(X_train, target)
    return clf

  return [fit_one(y_train[col]) for col in bin_cols]
perf_lrs = train_lrs(X_train,y_train)

## Evaluation

In [None]:
# testing accuracy
eval(predict(X_test,perf_lrs),y_test)

{'Accuracy': 1.0, 'Correct': 6554, 'Incorrect': 0, 'Total': 6554}

In [None]:
avg_abs_coefs = np.mean([np.abs(lr.coef_[0]) for lr in perf_lrs],axis=0)

# Task 2: Creating a Reduced Questionnaire

In [None]:
# number of questions to be removed
N = 10

## Using the Logistic Regression Model Coefficients

### Attribute Selection

In [None]:
avg_abs_coefs_std = std(avg_abs_coefs)
results_df = pd.DataFrame({"Q": q_cols,
                           "Avg Abs Coeff": avg_abs_coefs,
                           "Avg Abs Coeff Std": avg_abs_coefs_std})

In [None]:
# top 10 least important features
aac_df = results_df.sort_values(by="Avg Abs Coeff Std").head(N)
aac_df

Unnamed: 0,Q,Avg Abs Coeff,Avg Abs Coeff Std
115,q116,1.599984,0.0
127,q128,1.601402,0.000258
131,q132,1.604713,0.000861
91,q92,1.620894,0.003807
119,q120,1.622168,0.004039
114,q115,1.631863,0.005804
107,q108,1.636557,0.006659
135,q136,1.637995,0.006921
95,q96,1.638906,0.007086
86,q87,1.648988,0.008922


### Test-Train Split

In [None]:
# train-test split
X = df[q_cols]
X = X.drop(aac_df['Q'],axis=1)
y = df[bin_cols]
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)

### Training

In [None]:
lrs = train_lrs(X_train,y_train)

### Evaluation

In [None]:
# testing accuracy
eval(predict(X_test,lrs),y_test)

{'Accuracy': 0.9627708269758926,
 'Correct': 6310,
 'Incorrect': 244,
 'Total': 6554}

In [None]:
# seeing where mistakes are made
# NOTE(review): only the single least-important question is dropped here
# (head(1)), whereas the evaluation above dropped N questions — confirm this
# is intended.
X = df[q_cols]
X = X.drop(aac_df['Q'].head(1),axis=1)
y = df[bin_cols]
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)
lrs = train_lrs(X_train,y_train)
# Predicted four-letter types, in X_test row order.
predicted_types = pd.Series(np.apply_along_axis(to_type,1,predict(X_test,lrs)))
# X_test.index holds row *labels* of df, so select with label-based .loc;
# positional .iloc is only equivalent while df keeps a default 0..n-1 index.
actual_rows = df.loc[X_test.index, scale_cols+['psychotype']].reset_index(drop=True)
comp_df = pd.concat([predicted_types, actual_rows],axis=1)
# Column 0 is the predicted type; show the rows where it disagrees.
comp_df[comp_df[0] != comp_df['psychotype']][scale_cols+['psychotype']].head()

Unnamed: 0,scale_e,scale_i,scale_s,scale_n,scale_t,scale_f,scale_j,scale_p,psychotype
217,12,23,9,22,18,16,16,16,INTJ
317,18,20,19,16,19,16,17,17,ISTJ
330,24,13,17,14,12,25,17,18,ESFP
341,22,11,18,14,24,12,16,17,ESTP
549,19,17,16,23,12,21,20,20,ENFJ


## Using "Partial" Greedy Attribute Selection

### Attribute Selection

In [None]:
# Fresh split on the full question set, used as the baseline for dropping one
# question at a time.
X, y = df[q_cols], df[bin_cols]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1, test_size=0.3)
# Row labels of each split in a reproducibly shuffled order.
# NOTE(review): neither index is referenced by the cells visible here —
# confirm whether they are still needed.
train_ix = X_train.sample(frac=1,random_state=1).index
test_ix = X_test.sample(frac=1,random_state=1).index

In [None]:
# Leave-one-question-out ablation: retrain the four classifiers with a single
# question removed and record the resulting test accuracy, for every question.
# This refits the models once per question, so it is the slow cell here.
new_accs = []
for q in q_cols:
  # DataFrame.drop returns a new frame, so no explicit copy is needed.
  X_train2 = X_train.drop(q,axis=1)
  X_test2 = X_test.drop(q,axis=1)
  lrs = train_lrs(X_train2,y_train)
  new_accs.append(eval(predict(X_test2,lrs),y_test)['Accuracy'])

In [None]:
results_df['Drop Accuracy'] = new_accs
results_df['Drop Accuracy Std'] = 1 - std(results_df['Drop Accuracy'])

In [None]:
da_df = results_df[['Q','Drop Accuracy','Drop Accuracy Std']].sort_values(by="Drop Accuracy Std").head(N)
da_df

Unnamed: 0,Q,Drop Accuracy,Drop Accuracy Std
112,q113,0.997101,0.0
113,q114,0.996033,0.023256
98,q99,0.99466,0.053156
74,q75,0.993897,0.069767
99,q100,0.993287,0.083056
97,q98,0.993287,0.083056
95,q96,0.993287,0.083056
107,q108,0.993134,0.086379
63,q64,0.992829,0.093023
126,q127,0.992676,0.096346


In [None]:
results_df[['Avg Abs Coeff Std','Drop Accuracy Std']].corr()

Unnamed: 0,Avg Abs Coeff Std,Drop Accuracy Std
Avg Abs Coeff Std,1.0,0.938607
Drop Accuracy Std,0.938607,1.0


### Test-Train Split

In [None]:
# train-test split
X = df[q_cols]
X = X.drop(da_df['Q'],axis=1)
y = df[bin_cols]
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)

### Training

In [None]:
lrs = train_lrs(X_train,y_train)

### Evaluation

In [None]:
# testing accuracy
eval(predict(X_test,lrs),y_test)

{'Accuracy': 0.9609398840402807,
 'Correct': 6298,
 'Incorrect': 256,
 'Total': 6554}

In [None]:
# seeing where mistakes are made
# Predicted four-letter types, in X_test row order.
predicted_types = pd.Series(np.apply_along_axis(to_type,1,predict(X_test,lrs)))
# X_test.index holds row *labels* of df, so select with label-based .loc;
# positional .iloc is only equivalent while df keeps a default 0..n-1 index.
actual_rows = df.loc[X_test.index, scale_cols+['psychotype']].reset_index(drop=True)
comp_df = pd.concat([predicted_types, actual_rows],axis=1)
# Column 0 is the predicted type; show the rows where it disagrees.
comp_df[comp_df[0] != comp_df['psychotype']][scale_cols+['psychotype']].head()

Unnamed: 0,scale_e,scale_i,scale_s,scale_n,scale_t,scale_f,scale_j,scale_p,psychotype
20,19,15,31,6,16,20,16,18,ESFP
47,17,20,14,17,17,18,15,19,INFP
49,25,13,19,18,16,12,24,16,ESTJ
84,26,14,14,24,11,25,14,17,ENFP
89,23,14,6,23,24,9,15,16,ENTP


## Using Combination of Weights

In [None]:
results_df['Weight'] = results_df[['Avg Abs Coeff Std','Drop Accuracy Std']].mean(axis=1)

In [None]:
weight_df = results_df.sort_values(by="Weight").head(N)
weight_df

Unnamed: 0,Q,Avg Abs Coeff,Avg Abs Coeff Std,Drop Accuracy,Drop Accuracy Std,Weight
112,q113,1.731306,0.02391,0.997101,0.0,0.011955
113,q114,1.822424,0.040499,0.996033,0.023256,0.031877
98,q99,1.715312,0.020997,0.99466,0.053156,0.037077
95,q96,1.638906,0.007086,0.993287,0.083056,0.045071
74,q75,1.725781,0.022904,0.993897,0.069767,0.046336
107,q108,1.636557,0.006659,0.993134,0.086379,0.046519
99,q100,1.660288,0.010979,0.993287,0.083056,0.047018
63,q64,1.67881,0.014352,0.992829,0.093023,0.053687
126,q127,1.661152,0.011137,0.992676,0.096346,0.053741
139,q140,1.667576,0.012306,0.992524,0.099668,0.055987


In [None]:
weight_df['Q'].values.tolist()

['q113', 'q114', 'q99', 'q96', 'q75', 'q108', 'q100', 'q64', 'q127', 'q140']

### Test-Train Split

In [None]:
# train-test split
X = df[q_cols]
X = X.drop(weight_df['Q'],axis=1)
y = df[bin_cols]
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)

### Training

In [None]:
lrs = train_lrs(X_train,y_train)

### Evaluation

In [None]:
# testing accuracy
eval(predict(X_test,lrs),y_test)

{'Accuracy': 0.9629234055538602,
 'Correct': 6311,
 'Incorrect': 243,
 'Total': 6554}

In [None]:
# seeing where mistakes are made
# Predicted four-letter types, in X_test row order.
predicted_types = pd.Series(np.apply_along_axis(to_type,1,predict(X_test,lrs)))
# X_test.index holds row *labels* of df, so select with label-based .loc;
# positional .iloc is only equivalent while df keeps a default 0..n-1 index.
actual_rows = df.loc[X_test.index, scale_cols+['psychotype']].reset_index(drop=True)
comp_df = pd.concat([predicted_types, actual_rows],axis=1)
# Column 0 is the predicted type; show the rows where it disagrees.
comp_df[comp_df[0] != comp_df['psychotype']][scale_cols+['psychotype']].head()

Unnamed: 0,scale_e,scale_i,scale_s,scale_n,scale_t,scale_f,scale_j,scale_p,psychotype
20,19,15,31,6,16,20,16,18,ESFP
47,17,20,14,17,17,18,15,19,INFP
89,23,14,6,23,24,9,15,16,ENTP
138,16,21,16,15,11,24,16,16,ISFJ
185,16,16,17,24,16,12,22,14,ENTJ


## Most/Least Important Questions

In [None]:
# least important questions
least_qs = results_df.sort_values(by="Weight").head(N)['Q']
print_qs(least_qs)

q113. Can you be called a person who wants to constantly master and understand the new?
 (0) rather yes
 (1) probably not
q114. In a dispute, you:
 (0) argue your position
 (1) quickly jump to conclusions by skipping facts
q99. You would prefer to work with a manager who:
 (0) always kind
 (1) always fair
q96. Do you find that monotonous and familiar activities throughout the day:
 (0) reassure you
 (1) bore you
q75. What word in the “who - what” pair do you like more in meaning?
 (0) who
 (1) what
q108. You would be pleased to work with a boss who:
 (0) always accurate, punctual
 (1) always full of new ideas
q100. Your friend is easier to become someone who:
 (0) shares your standards and point of view
 (1) is an interesting person
q64. What word in the pair “swift - thorough” do you like more in meaning?
 (0) swift
 (1) thorough
q127. Can you say that you:
 (0) take emotional problems to heart
 (1) treat emotional problems as a hindrance in solving problems
q140. Preferably you:
 (0)

In [None]:
# most important questions
print_qs(results_df.sort_values(by="Weight").tail(N)['Q'])

q94. A higher praise for a person will be recognition:
 (0) his foresight, intuition
 (1) his common sense
q34. What word in the pair “theory - practice” do you like more in meaning?
 (0) theory
 (1) practice
q4. When you decide to go somewhere for one day, you usually:
 (0) plan what and when you will do
 (1) just hit the road
q31. What word in the pair of “thoughts - feelings” do you like more in meaning?
 (0) thoughts
 (1) feelings
q2. If you were a teacher, with great pleasure you would lead:
 (0) practical lessons
 (1) theoretical studies
q11. You usually:
 (0) value feelings more than logic
 (1) value logic more than feelings
q21. Usually you:
 (0) can talk freely with almost any person at any time
 (1) don't always find what to say to a stranger
q9. In a large company more often:
 (0) you introduce people to each other
 (1) you are introduced to others
q1. Usually you:
 (0) communicative
 (1) quite restrained and calm
q10. You can most likely be called:
 (0) practical person
 (1

# Task 3: Filling Missing Questions with Imputation

## Statistical Imputation

### Test-Train Split

In [None]:
# train-test split
X = df[q_cols]
y = df[bin_cols]
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)

### Statistical Imputation Model

In [None]:
# mean of each question, computed on the training split only
def stat_imputation(X_train,X_test,cols=None):
  """Fill selected columns of ``X_test`` with their training-set means.

  Args:
    X_train: frame used to compute the per-column means.
    X_test: frame whose selected columns are replaced; it is not modified.
    cols: iterable of column names to impute. Defaults to the notebook-level
      ``least_qs`` (the questions treated as missing).

  Returns:
    A copy of ``X_test`` in which every selected column holds the mean of
    that column in ``X_train``.
  """
  if cols is None:
    cols = least_qs
  cols = list(cols)
  fill_values = X_train[cols].mean()
  # Work on a copy so the caller's frame is left untouched.
  imputed = X_test.copy()
  for col in cols:
    imputed[col] = fill_values[col]
  return imputed

In [None]:
X_test = stat_imputation(X_train,X_test)

### Evaluation

In [None]:
# testing accuracy
eval(predict(X_test,perf_lrs),y_test)

{'Accuracy': 0.9610924626182484,
 'Correct': 6299,
 'Incorrect': 255,
 'Total': 6554}

## Autoencoder Imputation

### Test-Train Split

In [None]:
# train-test split
X = df[q_cols]
y = df[bin_cols]
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)

### Autoencoder Imputation Model

In [None]:
# Inputs are the retained questions only; the regression target is the full
# set of question answers, so the network learns to predict the removed ones
# (alongside the ones it was given).
subX = X_train.drop(columns=least_qs)

mlp_params = dict(hidden_layer_sizes=(10,4,10),
                  activation='relu',
                  solver='adam',
                  alpha=0,
                  tol=1e-20,
                  max_iter=10000,
                  random_state=1)
mlp_w = MLPRegressor(**mlp_params)

mlp_w.fit(subX,X_train)

# Report how long training ran and the final training loss.
print(f'Iterations: {mlp_w.n_iter_}')
print(f'Loss: {mlp_w.loss_}')

Iterations: 366
Loss: 0.0883871848472966


In [None]:
# apply model
# Predict every question from the retained ones, then overwrite only the
# removed questions with their predicted values.
X_pred = mlp_w.predict(X_test.drop(least_qs,axis=1))
# The model was fit with X_train as target, so X_pred's columns follow
# X_train's column order. Look the removed questions up by name instead of
# relying on least_qs.index (row labels of results_df) matching positions.
removed_positions = [X_train.columns.get_loc(q) for q in least_qs]
# Copy before writing so the assignment never targets a view of the split
# data (this pattern can otherwise raise SettingWithCopyWarning).
X_test = X_test.copy()
X_test[list(least_qs)] = X_pred[:, removed_positions]

### Evaluation

In [None]:
# testing accuracy
eval(predict(X_test,perf_lrs),y_test)

{'Accuracy': 0.9679584986267928,
 'Correct': 6344,
 'Incorrect': 210,
 'Total': 6554}