In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer, make_column_selector

In [4]:
myData = pd.read_csv("/Users/dan/calpoly/BusinessAnalytics/GSB544MACHINE/Final/DataClassification/CAH-201803-train.csv")
myData.head()

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [5]:
# Create target and predictor variables and split
X = myData.drop(columns=['political_affiliation'])
y = myData['political_affiliation']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

In [19]:
pipeline_logistic = Pipeline([
    ('preprocessor', ct),
    ('regressor', LogisticRegression(max_iter=1000))
])

# Create grid for tuning
logreg_params = {'regressor__C': [1]}
logreg_grid = GridSearchCV(pipeline_logistic, logreg_params, cv=5, scoring='accuracy')
logreg_grid.fit(X_train, y_train)

# Best Logistic Regression model
print(f"Best Logistic Regression parameters: {logreg_grid.best_params_}")
print(f"\nBest Accuracy: {logreg_grid.best_score_}")

# Confusion matrix
y_pred_logreg = logreg_grid.best_estimator_.predict(X_test)
print("\nConfusion Matrix:\n",confusion_matrix(y_test, y_pred_logreg))

Best Logistic Regression parameters: {'regressor__C': 1}

Best Accuracy: 0.6101538461538462

Confusion Matrix:
 [[10  2  3]
 [ 4  7  3]
 [ 4  1  9]]


In [20]:
test_data = pd.read_csv("/Users/dan/calpoly/BusinessAnalytics/GSB544MACHINE/Final/DataClassification/CAH-201803-test.csv")

final_predictions = pd.DataFrame(
    {"id_num": test_data['id_num'],
    "political_affiliation_predicted": final_model_fit.predict(test_data)}
)

final_predictions.to_csv("/Users/dan/calpoly/BusinessAnalytics/GSB544MACHINE/Final/DataClassification/predicted_political2.csv", index=False)