# Import Dependencies

In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load and Explore Data

In [42]:
# Load the dataset from a CSV path resolved relative to the current working directory
df = pd.read_csv(filepath_or_buffer='ExtroVsIntro.csv')

In [43]:
# Number of rows and columns in the DataFrame.
# Printed explicitly: a notebook cell only auto-displays its LAST expression,
# so a bare `df.shape` placed before `df.describe()` is evaluated and discarded.
print("DataFrame shape:", df.shape)

# Statistical measurements of the DataFrame
df.describe()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
count,4998.0,4998.0,4998.0,4998.0,4998.0,4998.0,4998.0,4998.0
mean,4.452381,0.482993,3.933774,3.023009,0.483593,6.278912,3.560624,0.480392
std,3.430728,0.499761,2.874991,2.233173,0.499781,4.275091,2.883958,0.499665
min,-1.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,0.0
25%,2.0,0.0,2.0,1.0,0.0,3.0,1.0,0.0
50%,4.0,0.0,4.0,3.0,0.0,5.0,3.0,0.0
75%,7.0,1.0,6.0,5.0,1.0,10.0,6.0,1.0
max,12.0,1.0,11.0,8.0,1.0,16.0,11.0,1.0


In [44]:
# Count how many rows belong to each Personality class
df.loc[:, 'Personality'].value_counts()

Personality
0    2597
1    2401
Name: count, dtype: int64

In [45]:
# Per-class mean of every feature, to compare the two Personality groups
df.groupby(by='Personality').mean()

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
Personality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2.125144,0.074317,5.911051,4.623412,0.074317,9.134001,5.572969
1,6.969596,0.925031,1.795085,1.291962,0.926281,3.190754,1.384007


In [46]:
# Separate the features from the target by column NAME rather than by position,
# so the selection stays correct even if the CSV's column order changes.
X = df.drop(columns='Personality')  # every column except the target
y = df['Personality']               # target labels

# Test Train Split

In [47]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the full, training and test feature sets
print(X.shape, train_X.shape, test_X.shape)

# Row counts of each split
print("Training set size:", train_X.shape[0])
print("Test set size:", test_X.shape[0])


(4998, 7) (3998, 7) (1000, 7)
Training set size: 3998
Test set size: 1000


# ML - Model - Logistic Regression
Logistic regression is a good fit for binary classification problems like this one.

In [48]:
# Create a logistic-regression classifier with default settings and fit it on the training split
model = LogisticRegression()
model.fit(X=train_X, y=train_y)

In [50]:
# Model Evaluation
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.

# Accuracy on the training set
train_X_prediction = model.predict(train_X)  # labels predicted from the training features
train_X_accuracy = accuracy_score(train_y, train_X_prediction)
print("Model accuracy on training data:", train_X_accuracy)

# Accuracy on the test set
test_X_prediction = model.predict(test_X)  # labels predicted from the held-out features
test_X_accuracy = accuracy_score(test_y, test_X_prediction)
print("Model accuracy on test data:", test_X_accuracy)


Model accuracy on training data: 0.9277138569284642
Model accuracy on test data: 0.916


# Making a Predictive System Based on the Trained Model

In [55]:
# Example input: one value per feature, in the same order as the training columns
input_data = (0.0, 0, 5.0, 4.0, 0, 11.0, 3.0)

# Input data needs to be converted into a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to (1, n_features) as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# The model was fitted on a DataFrame, so give the input the same column names;
# predicting on a bare array makes scikit-learn warn about missing feature names.
input_df = pd.DataFrame(input_data_reshaped, columns=train_X.columns)

# Predicting using the trained model
prediction = model.predict(input_df)
print("Prediction:", prediction)

# This notebook treats label 1 as Introvert and any other label as Extrovert
if prediction[0] == 1:
    print("The person is an Introvert")
else:
    print("The person is an Extrovert")

Prediction: [0]
The person is an Extrovert


