In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
ha = pd.read_csv("/content/CAH-201803-train.csv")

In [3]:
good_cols = ha.isna().sum() < 100
ha = ha.loc[:,good_cols]

# Drop other NAs
ha = ha.dropna()
ha.head()

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [4]:
X = ha.drop("political_affiliation", axis=1)
y = ha["political_affiliation"]

In [12]:
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lda_pipe = Pipeline([
    ("ct", ct),
    ("lda", LinearDiscriminantAnalysis())
])

# Parameter grid
paramgridlda = {
    "lda__solver": ["lsqr"],
    "lda__shrinkage": [None, 'auto',.5]
}

# GridSearchCV
lda_grid = GridSearchCV(lda_pipe, param_grid=paramgridlda, cv=5, scoring="accuracy",verbose=0, n_jobs=-1, error_score="raise")
lda_grid.fit(X, y)

# Best parameters and score
print(f"Best params: {lda_grid.best_params_}")
print(f"Best score: {lda_grid.best_score_}")

Best params: {'lda__shrinkage': 0.5, 'lda__solver': 'lsqr'}
Best score: 0.6213903743315508


In [15]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda_pipe = Pipeline([
    ("ct", ct),  # Reuse ColumnTransformer from earlier
    ("qda", QuadraticDiscriminantAnalysis())
])

# Parameter grid for QDA
paramgridqda = {
    "qda__reg_param": [0.0, 0.1, 0.5, 0.9]  # Regularization parameter
}

# GridSearchCV for QDA
qda_grid = GridSearchCV(
    qda_pipe,
    param_grid=paramgridqda,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    error_score="raise"
)
qda_grid.fit(X, y)

# Best parameters and score
print(f"Best QDA params: {qda_grid.best_params_}")
print(f"Best QDA score: {qda_grid.best_score_}")

Best QDA params: {'qda__reg_param': 0.5}
Best QDA score: 0.5859180035650624


In [16]:
from sklearn.svm import SVC

# Pipeline for SVC
svc_pipe = Pipeline([
    ("ct", ct),  # Reuse ColumnTransformer from earlier
    ("svc", SVC())
])

# Parameter grid for SVC
paramgridsvc = {
    "svc__C": [0.1, 1, 10],  # Regularization parameter
    "svc__kernel": ["linear", "rbf"],  # Kernel types
    "svc__gamma": ["scale", "auto"]  # Gamma options
}

# GridSearchCV for SVC
svc_grid = GridSearchCV(
    svc_pipe,
    param_grid=paramgridsvc,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    error_score="raise"
)
svc_grid.fit(X, y)

# Best parameters and score
print(f"Best SVC params: {svc_grid.best_params_}")
print(f"Best SVC cross-validated score: {svc_grid.best_score_}")

Best SVC params: {'svc__C': 1, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'}
Best SVC cross-validated score: 0.5857397504456328


In [17]:
svm_pipe = Pipeline([
    ("ct", ct),  # Reuse ColumnTransformer from earlier
    ("svm", SVC(kernel="poly"))  # Use polynomial kernel
])

# Parameter grid for SVM with polynomial kernel
paramgridsvm = {
    "svm__C": [0.1, 1, 10],  # Regularization parameter
    "svm__degree": [2, 3, 4],  # Polynomial degree
    "svm__gamma": ["scale", "auto"]  # Gamma options
}

# GridSearchCV for SVM
svm_grid = GridSearchCV(
    svm_pipe,
    param_grid=paramgridsvm,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    error_score="raise"
)
svm_grid.fit(X, y)

# Best parameters and score
print(f"Best SVM params: {svm_grid.best_params_}")
print(f"Best SVM cross-validated score: {svm_grid.best_score_}")


Best SVM params: {'svm__C': 1, 'svm__degree': 2, 'svm__gamma': 'auto'}
Best SVM cross-validated score: 0.5798573975044563


In [18]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

# Pipeline for LDA
lda_pipe = Pipeline([
    ("ct", ct),
    ("lda", LinearDiscriminantAnalysis())
])

# Parameter grid for LDA
paramgridlda = {
    "lda__solver": ["lsqr"],
    "lda__shrinkage": [None, "auto", 0.5]
}

# GridSearchCV for LDA
lda_grid = GridSearchCV(
    lda_pipe,
    param_grid=paramgridlda,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    error_score="raise"
)
lda_grid.fit(X, y)

# Best parameters and score
print(f"Best LDA params: {lda_grid.best_params_}")
print(f"Best LDA cross-validated score: {lda_grid.best_score_}")

Best LDA params: {'lda__shrinkage': 0.5, 'lda__solver': 'lsqr'}
Best LDA cross-validated score: 0.6213903743315508


In [19]:
qda_pipe = Pipeline([
    ("ct", ct),
    ("qda", QuadraticDiscriminantAnalysis())
])

# Parameter grid for QDA
paramgridqda = {
    "qda__reg_param": [0.0, 0.1,.2,.3,.4, 0.5,.6,.7,.8, 0.9]  # Regularization parameter
}

# GridSearchCV for QDA
qda_grid = GridSearchCV(
    qda_pipe,
    param_grid=paramgridqda,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    error_score="raise"
)
qda_grid.fit(X, y)

# Best parameters and score
print(f"Best QDA params: {qda_grid.best_params_}")
print(f"Best QDA cross-validated score: {qda_grid.best_score_}")

Best QDA params: {'qda__reg_param': 0.3}
Best QDA cross-validated score: 0.5914438502673797


In [20]:
# Pipeline for KNN
knn_pipe = Pipeline([
    ("ct", ct),
    ("knn", KNeighborsClassifier())
])

# Parameter grid for KNN
paramgridknn = {
    "knn__n_neighbors": [3, 5, 7, 10],
    "knn__weights": ["uniform", "distance"]
}

# GridSearchCV for KNN
knn_grid = GridSearchCV(
    knn_pipe,
    param_grid=paramgridknn,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    error_score="raise"
)
knn_grid.fit(X, y)

# Best parameters and score
print(f"Best KNN params: {knn_grid.best_params_}")
print(f"Best KNN cross-validated score: {knn_grid.best_score_}")

Best KNN params: {'knn__n_neighbors': 5, 'knn__weights': 'distance'}
Best KNN cross-validated score: 0.5502673796791444


In [24]:
class_labels = ["Independent", "Republican", "Democrat"]
for label in class_labels:
    #binary target for the class to loop
    y2_binary = np.where(y == label, 1, 0)
    X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2_binary, test_size=0.2, stratify=y2_binary, random_state=42)

    #train svc
    svc = SVC(probability=True, random_state=1)
    svc.fit(X2_train, y2_train)

    #evaluate
    y2_pred = svc.predict(X2_test)
    print(f"Metrics for SVC: {label} vs. Not {label}")
    print(classification_report(y2_test, y2_pred))

ValueError: could not convert string to float: 'Female'

In [26]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming 'ha' is your dataset
# Define features (X) and target (y)
X = ha.drop("political_affiliation", axis=1)
y = ha["political_affiliation"]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define which columns are categorical (adjust this based on your actual data)
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# Create a column transformer to apply OneHotEncoder to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)  # OneHotEncoder for categorical columns
    ],
    remainder='passthrough'  # Keep other columns as they are
)

# Create a pipeline with preprocessing + logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),  # Optional: you can apply scaling after encoding
    ('classifier', LogisticRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")


Accuracy: 0.6274509803921569
Confusion Matrix:
[[13  2  0]
 [ 9  4  4]
 [ 2  2 15]]
Classification Report:
              precision    recall  f1-score   support

    Democrat       0.54      0.87      0.67        15
 Independent       0.50      0.24      0.32        17
  Republican       0.79      0.79      0.79        19

    accuracy                           0.63        51
   macro avg       0.61      0.63      0.59        51
weighted avg       0.62      0.63      0.60        51



In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define the pipeline for logistic regression
logreg_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("logreg", LogisticRegression(max_iter=1000))  # Logistic Regression with increased iterations for convergence
])

# Define the hyperparameter grid
logreg_params = {
    'logreg__C': [0.01, 0.1, 1, 10, 100]
}

# Perform grid search with accuracy as the scoring metric
loggrid = GridSearchCV(logreg_pipeline, logreg_params, cv=5, scoring='accuracy')
loggrid.fit(X, y)

# Get the best hyperparameters
best_logreg_params = loggrid.best_params_['logreg__C']

# Create a final pipeline with the best parameters
logreg_final_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("logreg", LogisticRegression(
        C=best_logreg_params,
        max_iter=1000
    ))
])

# Fit the final model on the entire dataset
logreg_final_pipeline.fit(X, y)

# Evaluate the accuracy on the training set
y_train_pred = logreg_final_pipeline.predict(X)
train_accuracy = accuracy_score(y, y_train_pred)

# Results
print("Best Logistic Regression Parameters:")
print("C:", best_logreg_params)
print("Best cross-validated accuracy:", loggrid.best_score_)
print(f"Training Accuracy: {train_accuracy}")

Best Logistic Regression Parameters:
C: 0.1
Best cross-validated accuracy: 0.6215686274509804
Training Accuracy: 0.7455621301775148


In [45]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define the pipeline for logistic regression
logreg_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("logreg", LogisticRegression(max_iter=1000))  # Logistic Regression with increased iterations for convergence
])

# Define the expanded hyperparameter grid
logreg_params = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization type
    'logreg__solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],  # Solvers to choose from
    'logreg__class_weight': [None, 'balanced'],  # Handling imbalanced data
    'logreg__multi_class': ['ovr', 'multinomial','ovo'],  # For multi-class classification
}

# Perform grid search with accuracy as the scoring metric
loggrid = GridSearchCV(logreg_pipeline, logreg_params, cv=5, scoring='accuracy')
loggrid.fit(X, y)

# Get the best hyperparameters
best_logreg_params = loggrid.best_params_['logreg__C']

# Create a final pipeline with the best parameters
logreg_final_pipeline = Pipeline([
    ("preprocessing", ct),
    ("logreg", LogisticRegression(
        C=best_logreg_params,
        max_iter=1000,
        penalty=loggrid.best_params_['logreg__penalty'],
        solver=loggrid.best_params_['logreg__solver'],
        class_weight=loggrid.best_params_['logreg__class_weight'],
        multi_class=loggrid.best_params_['logreg__multi_class']
    ))
])

# Fit the final model on the entire dataset
logreg_final_pipeline.fit(X, y)

# Evaluate the accuracy on the training set
y_train_pred = logreg_final_pipeline.predict(X)
train_accuracy = accuracy_score(y, y_train_pred)

# Results
print("Best Logistic Regression Parameters:")
for param, value in loggrid.best_params_.items():
    print(f"{param}: {value}")
print("Best cross-validated accuracy:", loggrid.best_score_)
print(f"Training Accuracy: {train_accuracy}")
final_model = logreg_final_pipeline

Best Logistic Regression Parameters:
logreg__C: 0.1
logreg__class_weight: balanced
logreg__multi_class: multinomial
logreg__penalty: l2
logreg__solver: saga
Best cross-validated accuracy: 0.6333333333333333
Training Accuracy: 0.7396449704142012


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define the pipeline for logistic regression
logreg_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("logreg", LogisticRegression(max_iter=1000))  # Logistic Regression with increased iterations for convergence
])

# Define the expanded hyperparameter grid
logreg_params = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization type
    'logreg__solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],  # Solvers to choose from
    'logreg__class_weight': [None, 'balanced'],  # Handling imbalanced data
    'logreg__multi_class': ['ovr', 'multinomial','ovo'],  # For multi-class classification
}

# Perform grid search with accuracy as the scoring metric
loggrid = GridSearchCV(logreg_pipeline, logreg_params, cv=10, scoring='accuracy')
loggrid.fit(X, y)

# Get the best hyperparameters
best_logreg_params = loggrid.best_params_['logreg__C']

# Create a final pipeline with the best parameters
logreg_final_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("logreg", LogisticRegression(
        C=best_logreg_params,
        max_iter=1000,
        penalty=loggrid.best_params_['logreg__penalty'],
        solver=loggrid.best_params_['logreg__solver'],
        class_weight=loggrid.best_params_['logreg__class_weight'],
        multi_class=loggrid.best_params_['logreg__multi_class']
    ))
])

# Fit the final model on the entire dataset
logreg_final_pipeline.fit(X, y)

# Evaluate the accuracy on the training set
y_train_pred = logreg_final_pipeline.predict(X)
train_accuracy = accuracy_score(y, y_train_pred)

# Results
print("Best Logistic Regression Parameters:")
for param, value in loggrid.best_params_.items():
    print(f"{param}: {value}")
print("Best cross-validated accuracy:", loggrid.best_score_)
print(f"Training Accuracy: {train_accuracy}")
final_model = logreg_final_pipeline

Best Logistic Regression Parameters:
logreg__C: 0.1
logreg__class_weight: balanced
logreg__multi_class: ovr
logreg__penalty: l2
logreg__solver: saga
Best cross-validated accuracy: 0.6632352941176471
Training Accuracy: 0.7159763313609467


In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Define the pipeline for decision tree
dt_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("decision_tree", DecisionTreeClassifier())
])

# Hyperparameter grid for 'min_impurity_decrease'
dtt = {'decision_tree__min_impurity_decrease': [ 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}

# Perform grid search with ROC AUC as the scoring metric
dec_grid = GridSearchCV(dt_pipeline, dtt, cv=5, scoring='accuracy')
dec_grid.fit(X, y)

# Get the best hyperparameters
best_impurity_decrease = dec_grid.best_params_['decision_tree__min_impurity_decrease']

# Create a final pipeline using the best 'min_impurity_decrease'
dt_final_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("decision_tree", DecisionTreeClassifier(min_impurity_decrease=best_impurity_decrease))
])

# Fit the final model
dtfitted = dt_final_pipeline.fit(X, y)

# Results from the grid search
print("Best Decision Tree Parameters:")
print("Min Impurity Decrease:", best_impurity_decrease)
print("Best cross-validated ROC AUC:", dec_grid.best_score_)

# Evaluate the accuracy on the training set
y_train_pred = dt_final_pipeline.predict(X)
train_accuracy = accuracy_score(y, y_train_pred)

# Final results
print(f"Training Accuracy: {train_accuracy}")


Best Decision Tree Parameters:
Min Impurity Decrease: 0.1
Best cross-validated ROC AUC: 0.5563279857397505
Training Accuracy: 0.5562130177514792


In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Define the pipeline for decision tree
dt_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("decision_tree", DecisionTreeClassifier())
])

# Expanded hyperparameter grid for tuning
dtt = {
    'decision_tree__min_impurity_decrease': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'decision_tree__max_depth': [None, 10, 20, 30, 40, 50],
    'decision_tree__min_samples_split': [2, 5, 10, 20],
    'decision_tree__min_samples_leaf': [1, 2, 5, 10]
}

# Perform grid search with ROC AUC as the scoring metric
dec_grid = GridSearchCV(dt_pipeline, dtt, cv=5, scoring='accuracy')
dec_grid.fit(X, y)

# Get the best hyperparameters
best_params = dec_grid.best_params_

# Create a final pipeline using the best parameters
dt_final_pipeline = Pipeline([
    ("preprocessing", ct),  # Assume `ct` is your column transformer
    ("decision_tree", DecisionTreeClassifier(
        min_impurity_decrease=best_params['decision_tree__min_impurity_decrease'],
        max_depth=best_params['decision_tree__max_depth'],
        min_samples_split=best_params['decision_tree__min_samples_split'],
        min_samples_leaf=best_params['decision_tree__min_samples_leaf']
    ))
])

# Fit the final model
dtfitted = dt_final_pipeline.fit(X, y)

# Results from the grid search
print("Best Decision Tree Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

print("Best cross-validated ROC AUC:", dec_grid.best_score_)

# Evaluate the accuracy on the training set
y_train_pred = dt_final_pipeline.predict(X)
train_accuracy = accuracy_score(y, y_train_pred)

# Final results
print(f"Training Accuracy: {train_accuracy}")
print("Best cross-validated ROC AUC:", dec_grid.best_score_)


Best Decision Tree Parameters:
decision_tree__max_depth: None
decision_tree__min_impurity_decrease: 0.1
decision_tree__min_samples_leaf: 1
decision_tree__min_samples_split: 2
Best cross-validated ROC AUC: 0.5563279857397505
Training Accuracy: 0.5562130177514792
Best cross-validated ROC AUC: 0.5563279857397505


In [8]:
test_data=pd.read_csv("/content/CAH-201803-test.csv")

In [9]:
final_predictions = pd.DataFrame(
    {"id_num": test_data['id_num'],
    "political_affiliation_predicted": final_model.predict(test_data)}
)

In [13]:
final_predictions.to_csv("final_predictions1.csv", index=False)

In [10]:
final_predictions["political_affiliation_predicted"].value_counts()

Unnamed: 0_level_0,count
political_affiliation_predicted,Unnamed: 1_level_1
Independent,61
Democrat,57
Republican,48


In [12]:
l1=pd.read_csv("/content/final_predictions.csv")
l1["political_affiliation_predicted"].value_counts()

Unnamed: 0_level_0,count
political_affiliation_predicted,Unnamed: 1_level_1
Independent,62
Democrat,59
Republican,45
