In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Perceptron
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [10]:
# Load the tab-separated survey export and drop every row that has a missing value.
# The path is a raw string: in a normal literal "\d" is an invalid escape sequence,
# which Python warns about and will eventually reject.
# NOTE(review): quotechar=" " looks like a workaround so that the double quotes in the
# file are not treated as field quoting -- confirm against the file before changing it.
df_raw = (
    pd.read_csv(r"D:\data-final-selected.csv", sep="\t", quotechar=" ")
    .dropna()
    # The first and last column headers arrive with a stray double quote attached.
    .rename(columns={'"EXT1': "EXT1", 'long_appx_lots_of_err"': "long_appx_lots_of_err"})
)

# The same stray quote is stripped from the values of those two columns;
# EXT1 is then converted from text to integers.
df_raw["EXT1"] = df_raw["EXT1"].str.replace('"', "", regex=False).astype("int64")
df_raw["long_appx_lots_of_err"] = df_raw["long_appx_lots_of_err"].str.replace('"', "", regex=False)

In [11]:
# Work on an independent copy of the first 50 columns of df_raw
# (EXT1 ... OPN10 in the preview printed below).
df = df_raw.loc[:, df_raw.columns[:50]].copy()

print(df.head(3))
print("Sample size : {}".format(df.shape[0]))

   EXT1  EXT2  EXT3  EXT4  EXT5  EXT6  EXT7  EXT8  EXT9  EXT10  ...  OPN1  \
0     4     1     5     2     5     1     5     2     4      1  ...     5   
1     3     5     3     4     3     3     2     5     1      5  ...     1   
2     2     3     4     4     3     2     1     3     2      5  ...     5   

   OPN2  OPN3  OPN4  OPN5  OPN6  OPN7  OPN8  OPN9  OPN10  
0     1     4     1     4     1     5     3     4      5  
1     2     4     2     3     1     4     2     5      3  
2     1     2     1     4     2     5     3     4      4  

[3 rows x 50 columns]
Sample size : 10432


In [12]:
# Deriving labels: one score per trait, each a constant plus a signed sum of that
# trait's ten items.
# NOTE(review): the offsets and signs presumably follow the questionnaire's scoring
# key -- confirm against the instrument's documentation.


def _keyed_sum(data, prefix, base, signs):
    """Return ``base`` plus the signed sum of items ``prefix1`` .. ``prefixN``.

    ``data`` only needs to support ``data["<column>"]``, so it may be a single
    row (yielding one number) or a whole DataFrame (yielding one Series with a
    value per row, computed with column arithmetic instead of a Python loop).
    ``signs`` holds +1 / -1 for each item, in item order.
    """
    total = base
    for item_number, sign in enumerate(signs, start=1):
        total = total + sign * data[prefix + str(item_number)]
    return total


def score_E(row):
    """E score: 20, plus odd-numbered EXT items, minus even-numbered EXT items."""
    return _keyed_sum(row, "EXT", 20, (+1, -1, +1, -1, +1, -1, +1, -1, +1, -1))


def score_A(row):
    """A score: 14, minus AGR1/3/5/7, plus AGR2/4/6/8/9/10."""
    return _keyed_sum(row, "AGR", 14, (-1, +1, -1, +1, -1, +1, -1, +1, +1, +1))


def score_C(row):
    """C score: 14, plus CSN1/3/5/7/9/10, minus CSN2/4/6/8."""
    return _keyed_sum(row, "CSN", 14, (+1, -1, +1, -1, +1, -1, +1, -1, +1, +1))


def score_N(row):
    """N score (built from the EST items): 38, plus EST2/4, minus all other EST items."""
    return _keyed_sum(row, "EST", 38, (-1, +1, -1, +1, -1, -1, -1, -1, -1, -1))


def score_O(row):
    """O score: 8, plus OPN1/3/5/7/8/9/10, minus OPN2/4/6."""
    return _keyed_sum(row, "OPN", 8, (+1, -1, +1, -1, +1, -1, +1, +1, +1, +1))


# Passing the whole frame scores every respondent at once; this yields the same
# values as applying the function row by row, without the per-row overhead.
df["score_E"] = score_E(df)
df["score_A"] = score_A(df)
df["score_C"] = score_C(df)
df["score_N"] = score_N(df)
df["score_O"] = score_O(df)

print(df.head(3))

   EXT1  EXT2  EXT3  EXT4  EXT5  EXT6  EXT7  EXT8  EXT9  EXT10  ...  OPN6  \
0     4     1     5     2     5     1     5     2     4      1  ...     1   
1     3     5     3     4     3     3     2     5     1      5  ...     1   
2     2     3     4     4     3     2     1     3     2      5  ...     2   

   OPN7  OPN8  OPN9  OPN10  score_E  score_A  score_C  score_N  score_O  
0     5     3     4      5       36       29       22       26       35  
1     4     2     5      3       10       34       27       25       25  
2     5     3     4      4       15       32       24       24       31  

[3 rows x 55 columns]


In [13]:
def personality(row):
    """Return the integer code of the trait with the highest score in ``row``.

    Codes: 0 -> E, 1 -> A, 2 -> C, 3 -> N, 4 -> O.

    ``row`` must support ``row["score_E"]`` ... ``row["score_O"]`` (a DataFrame
    row or a plain dict both work). When several traits share the highest
    score, the first one in E, A, C, N, O order wins.

    The running maximum starts at negative infinity rather than 0, so a row
    whose scores are all zero or negative (possible when answers fall outside
    the 1-5 range) still gets the trait that is actually highest instead of
    always falling back to E.
    """
    score_columns = ("score_E", "score_A", "score_C", "score_N", "score_O")
    big_letter = 0
    highest_score = float("-inf")
    for code, column in enumerate(score_columns):
        # Strict comparison keeps the earliest trait on ties.
        if row[column] > highest_score:
            highest_score = row[column]
            big_letter = code
    return big_letter

# Label every respondent: personality() is called once per row and its
# integer result is stored in the new "target" column.
df["target"] = df.apply(personality, axis="columns")

print(df.head(3))

   EXT1  EXT2  EXT3  EXT4  EXT5  EXT6  EXT7  EXT8  EXT9  EXT10  ...  OPN7  \
0     4     1     5     2     5     1     5     2     4      1  ...     5   
1     3     5     3     4     3     3     2     5     1      5  ...     4   
2     2     3     4     4     3     2     1     3     2      5  ...     5   

   OPN8  OPN9  OPN10  score_E  score_A  score_C  score_N  score_O  target  
0     3     4      5       36       29       22       26       35       0  
1     2     5      3       10       34       27       25       25       1  
2     3     4      4       15       32       24       24       31       1  

[3 rows x 56 columns]


In [14]:
# Labels come from the derived "target" column; features are the first 50
# columns of df, copied so later changes to df do not affect them.
y = df["target"]
X = df.loc[:, df.columns[:50]].copy()

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=6,
)

# Perceptron limited to 40 passes over the training data.
classifier = Perceptron(max_iter=40)
classifier.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on, then on the held-out part.
train_score = classifier.score(X_train, y_train)
print("Score on training data: {:.4f}".format(train_score))
test_score = classifier.score(X_test, y_test)
print("Score on testing data: {:.4f}".format(test_score))


Score on training data: 0.7919
Score on testing data: 0.7671


In [15]:
# 4 fictitious characters, each described by 50 answers.
# NOTE(review): the values are assumed to be in the same column order as the
# training features -- confirm against the feature frame.

Willy = np.array([5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
                  5.0, 1.0, 5.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
                  5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 1.0, 1.0, 1.0, 5.0,
                  1.0, 5.0, 1.0, 5.0, 1.0, 1.0, 1.0, 1.0, 5.0, 1.0,
                  1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 1.0, 1.0, 1.0])
Xavier = np.array([1.0, 5.0, 3.0, 5.0, 1.0, 4.0, 1.0, 5.0, 1.0, 4.0,
                   2.0, 3.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0,
                   4.0, 2.0, 3.0, 2.0, 4.0, 2.0, 4.0, 2.0, 1.0, 3.0,
                   2.0, 2.0, 1.0, 4.0, 4.0, 1.0, 3.0, 4.0, 1.0, 1.0,
                   1.0, 5.0, 2.0, 4.0, 2.0, 4.0, 2.0, 1.0, 4.0, 2.0])
Yenny = np.array([5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
                  1.0, 2.0, 2.0, 4.0, 2.0, 3.0, 4.0, 2.0, 4.0, 1.0,
                  1.0, 4.0, 2.0, 2.0, 4.0, 4.0, 4.0, 3.0, 2.0, 4.0,
                  5.0, 1.0, 5.0, 1.0, 4.0, 4.0, 4.0, 1.0, 5.0, 5.0,
                  5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 5.0, 5.0, 5.0])
Zee = np.array([2.0, 1.0, 5.0, 4.0, 5.0, 1.0, 1.0, 2.0, 1.0, 3.0,
                1.0, 5.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0,
                2.0, 4.0, 1.0, 5.0, 2.0, 5.0, 2.0, 5.0, 5.0, 5.0,
                4.0, 2.0, 2.0, 1.0, 1.0, 4.0, 5.0, 4.0, 2.0, 1.0,
                4.0, 5.0, 2.0, 5.0, 2.0, 4.0, 3.0, 3.0, 5.0, 1.0])

# One row per individual, in the order Willy, Xavier, Yenny, Zee.
fictitious_individuals = np.array([Willy, Xavier, Yenny, Zee])

# Keep the predicted codes so the report below is built from the model's
# actual output instead of from hand-copied numbers that can go stale.
predictions = classifier.predict(fictitious_individuals)
print(predictions)

# Integer class code -> trait letter.
big_five_dict = {0:"E", 1:"A", 2:"C", 3:"N", 4:"O"}
predicted_letters = [big_five_dict[int(code)] for code in predictions]
print("Willy's Personality : {}\nXavier's Personality: {}\nYenny's Personality: {}\nZee's Personality: {}".format(*predicted_letters))


[0 3 4 1]
Willy's Personality : E
Xavier's Personality: N
Yenny's Personality: O
Zee's Personality: A
