In [None]:
# https://en.wikipedia.org/wiki/Big_Five_personality_traits
# https://ipip.ori.org/newBigFive5broadKey.htm

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the tab-separated survey export and work on a copy so the
# freshly loaded frame stays available unchanged.
file_to_open = pd.read_csv("Resources/data.csv", sep="\t")
data = file_to_open.copy()
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Column labels available in the loaded frame.
data.columns

In [None]:
# True if at least one cell anywhere in the frame is missing.
data.isna().to_numpy().any()

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna()

In [None]:
# Report how many rows (participants) remain.
print(f"Number of participants: {data.shape[0]}")

In [None]:
# Frequency of each distinct value in the age column.
data["age"].value_counts()

In [None]:
# Latest birth year we expect to find in the age column.
# 2012 is the reference year used when birth years are converted to ages;
# 13 is presumably the minimum participant age -- TODO confirm.
SURVEY_YEAR = 2012
MIN_AGE = 13
max_year = SURVEY_YEAR - MIN_AGE
print(f"Youngest participant's date of birth: {max_year}")

In [None]:
# Keep only rows whose age entry is below 2000.
age_below_2000 = data["age"] < 2000
data = data[age_below_2000]
data.head()

In [None]:
# Some participants entered a birth year instead of an age. Any value above
# 1960 is treated as a birth year and converted to an age relative to 2012;
# all other values are kept as entered.
entered_birth_year = data["age"] > 1960
# assign() builds a new frame, so no column is written onto a filtered slice
# (which can raise SettingWithCopyWarning), and mask() does the conversion
# vectorized instead of looping in Python.
data = data.assign(age=data["age"].mask(entered_birth_year, 2012 - data["age"]))
# Drop implausible ages that remain after the conversion.
data = data.loc[data["age"] <= 120]
data.head()

In [None]:
# Ages from oldest to youngest, to eyeball the upper end of the range.
data["age"].sort_values(ascending=False)

In [None]:
# Summary statistics for the columns of the cleaned frame.
data.describe()

In [None]:
# data.dtypes

In [None]:
# Number of participants per country value.
data["country"].value_counts()

In [None]:
# Average number of participants per country value.
data["country"].value_counts().mean()

In [None]:
# Participants per country as a one-column frame.
# The column is named explicitly: since pandas 2.0 value_counts() names its
# result "count" (and moves "country" to the index name), so relying on the
# implicit column name raises KeyError on current pandas.
countries = data["country"].value_counts().rename_axis(None).to_frame("country")

# Keep only countries with more than 125 participants.
c_125 = countries.loc[countries["country"] > 125]

plt.bar(c_125.index, c_125["country"])
plt.title("Countries with more than 125 participants")
plt.xlabel("Country")
plt.ylabel("Participants")
plt.show()

In [None]:
# Number of distinct values in each column of c_125.
c_125.nunique()

In [None]:
# Prepare the feature matrix used to decide how many clusters to use.
from sklearn.preprocessing import MinMaxScaler

# Remove the country column and the first six remaining columns, then
# rescale every column that is left to the [0, 1] range.
df = data.drop(columns="country")
trim_df = df.drop(columns=df.columns[:6])
columns = trim_df.columns.tolist()

scaler = MinMaxScaler(feature_range=(0,1))
scale_df = pd.DataFrame(scaler.fit_transform(trim_df), columns=columns)
scale_df.head()

In [None]:
# Elbow method: fit KMeans for k = 2..14 and plot the distortion curve.
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

# Fixed seed so the elbow curve is reproducible between runs.
kmeans = KMeans(random_state=42)
visualizer = KElbowVisualizer(kmeans, k=(2,15))
# Fit on the scaled question columns (scale_df); the unscaled df still
# contains the six leading columns that were excluded from the features.
visualizer.fit(scale_df)
# show() replaces the deprecated poof().
visualizer.show()

In [None]:
# cluster analysis- Aysin

In [None]:
from sklearn.cluster import KMeans
# Five-cluster model. The seed is fixed because KMeans initialisation is
# random; without it the cluster labels change on every run.
kmeans = KMeans(n_clusters=5, random_state=42)

In [None]:
# Fit the KMeans model on the scaled feature frame.
kmeans.fit(scale_df)

In [None]:
# Cluster index predicted by the fitted model for each row of scale_df.
predicted_clusters = kmeans.predict(scale_df)

In [None]:
# Pull the fitted cluster centers and the per-row cluster labels off the model.
centers, labels = kmeans.cluster_centers_, kmeans.labels_
labels

In [None]:
# Number of cluster labels produced by the fitted model.
len(labels)

In [None]:
# Display the fitted cluster centers.
centers

In [None]:
# Attach each row's cluster label as a new "cluster" column.
scale_df = scale_df.assign(cluster=labels.tolist())
scale_df.head()

In [None]:
# analysis of the clusters- Michael

In [None]:
# analysis of the questions- Stephen

# Silhouette score of the current cluster model, computed on the scaled
# features with the cluster label column removed.

from sklearn import metrics
from sklearn.metrics import pairwise_distances

raw_df = scale_df.drop("cluster", axis=1)

print(metrics.silhouette_score(raw_df, labels, metric='euclidean'))


In [None]:
# Per question: average the scaled responses within each cluster, then take
# the standard deviation of those cluster averages. A large value means the
# clusters differ strongly on that question.
cluster_groups = scale_df.groupby("cluster")
question_df = cluster_groups.mean()

question_std = question_df.std(axis=0)

# Most cluster-separating questions first.
sorted_std = question_std.sort_values(ascending=False)

sorted_std

In [None]:
# Bar chart of the sorted standard deviations, to spot obvious
# discontinuities between groups of questions.

sorted_std.plot(kind="bar")

In [None]:
# Print the question labels in their sorted order.
print(sorted_std.index)

In [None]:
# Two nested question subsets to analyse: cut1 holds 37 questions and cut2
# is its first 21 entries.

cut1 = [
    "E7", "N8", "E5", "N6", "E3", "N10", "N7", "E10", "N1", "E2",
    "N9", "E9", "E4", "E1", "E6", "A7", "C4", "N5", "A9", "A4",
    "N3", "A2", "A5", "E8", "A6", "C6", "A10", "N2", "A8", "A1",
    "A3", "C8", "C5", "C2", "C9", "N4", "C1",
]

cut2 = cut1[:21]

# Feature frames restricted to each question subset.
brief1_df, brief2_df = raw_df[cut1], raw_df[cut2]

In [None]:
# Create a 5-cluster model for each question subgroup.
# Both models share a fixed seed so that differences in their scores come
# from the question subset and not from random initialisation.

kmeans_brief1 = KMeans(n_clusters=5, random_state=42)
kmeans_brief1.fit(brief1_df)

kmeans_brief2 = KMeans(n_clusters=5, random_state=42)
kmeans_brief2.fit(brief2_df)


In [None]:
# Print the silhouette score of the full question set followed by the
# scores of the two question subgroups, for comparison.

brief1_labels, brief2_labels = kmeans_brief1.labels_, kmeans_brief2.labels_

for features, assignments in (
    (raw_df, labels),
    (brief1_df, brief1_labels),
    (brief2_df, brief2_labels),
):
    print(metrics.silhouette_score(features, assignments, metric='euclidean'))

In [None]:
# Questions to test one at a time to identify the useful ones.

# All 50 question labels, ten per row.
colsToDrop = [
    "E7", "N8", "E5", "N6", "E3", "N10", "N7", "E10", "N1", "E2",
    "N9", "E9", "E4", "E1", "E6", "A7", "C4", "N5", "A9", "A4",
    "N3", "A2", "A5", "E8", "A6", "C6", "A10", "N2", "A8", "A1",
    "A3", "C8", "C5", "C2", "C9", "N4", "C1", "O2", "O10", "O5",
    "C7", "C10", "O7", "C3", "O1", "O8", "O6", "O4", "O9", "O3",
]

In [None]:
# Check other cluster counts for better coverage with different question sets

clusterCount = [3,4,5,6,7,8,9,10]

# Baseline silhouette score of the full question set for each cluster count;
# these serve as thresholds for the per-question analysis.

thresholds = []

for x in clusterCount:

    # Fixed seed: KMeans initialisation is random, and an unseeded baseline
    # would shift between runs and make the thresholds irreproducible.
    kmeans_control = KMeans(n_clusters=x, random_state=42)
    kmeans_control.fit(raw_df)

    control_labels = kmeans_control.labels_

    thresholds.append(metrics.silhouette_score(raw_df, control_labels, metric="euclidean"))

thresholds

In [None]:
# Pair every cluster count with its baseline silhouette score.

threshold_df = pd.DataFrame({"Clusters": clusterCount, "Threshold": thresholds})

threshold_df

In [None]:
# Leave-one-question-out analysis: for every question and every cluster
# count, refit KMeans without that question and record the silhouette score.
# Note: this performs len(colsToDrop) * len(clusterCount) model fits plus a
# silhouette computation each, so the cell is slow on the full data set.

clusterCount = [3,4,5,6,7,8,9,10]

# Collect plain records and build the frame once at the end. This avoids the
# quadratic cost of pd.concat inside the loop, gives a unique index, and no
# longer needs the np.NaN placeholder columns (np.NaN was removed in NumPy 2.0).
score_records = []

for x in colsToDrop:
    test_df = raw_df.drop(columns=[x])

    for y in clusterCount:
        # Fixed seed so score differences reflect the dropped question
        # rather than random initialisation.
        kmeans_test = KMeans(n_clusters=y, random_state=42)
        kmeans_test.fit(test_df)

        test_labels = kmeans_test.labels_

        score = metrics.silhouette_score(test_df, test_labels, metric="euclidean")

        score_records.append({"Question": x, "Score": score, "N": y})

scores_df = pd.DataFrame(score_records, columns=["Question", "Score", "N"])

scores_df

In [None]:
# For each cluster count, keep only the questions whose omission produced a
# silhouette score BELOW that count's baseline threshold (i.e. drop every
# question whose omission did not lower the score).
# Re-run the cluster analysis using only the qualifying questions,
# visualize the silhouette plot for each cluster count,
# and save the results to a dataframe.

from yellowbrick.cluster import SilhouetteVisualizer

# Records are gathered in a list and turned into a frame once, instead of
# concatenating onto np.NaN placeholder columns (np.NaN was removed in
# NumPy 2.0).
trim_records = []

for index, row in threshold_df.iterrows():
    # Access by label: positional row[0]/row[1] on a label-indexed Series is
    # deprecated. iterrows() yields a float row for mixed int/float columns,
    # so the cluster count is cast back to int.
    n_clusters = int(row["Clusters"])
    threshold = row["Threshold"]

    temp_df = scores_df[(scores_df["N"] == n_clusters) & (scores_df["Score"] < threshold)]

    tempQuest = temp_df["Question"].tolist()

    if not tempQuest:
        # No question qualified for this cluster count; KMeans cannot be
        # fitted on zero features, so record the gap and move on.
        trim_records.append({"N": n_clusters, "Score": np.nan, "Questions": tempQuest})
        continue

    tempTrim_df = raw_df[tempQuest]

    # Fixed seed for reproducible clusters.
    kmeans_temp = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_temp.fit(tempTrim_df)

    temp_labels = kmeans_temp.labels_

    trim_score = metrics.silhouette_score(tempTrim_df, temp_labels, metric="euclidean")

    trim_records.append({"N": n_clusters, "Score": trim_score, "Questions": tempQuest})

    visualizer = SilhouetteVisualizer(kmeans_temp, colors='yellowbrick')

    visualizer.fit(tempTrim_df)
    visualizer.show()


trim_df = pd.DataFrame(trim_records, columns=["N", "Score", "Questions"])

trim_df

In [None]:
# demographic analysis-James
#gender, age

In [None]:
#demographic analysis -Hai
# country, handedness,engnat