In [1]:
import os

import numpy as np
import pandas as pd
import statsmodels.regression.linear_model as lm
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Overview

## Data Cleaning
The only urgent thing to do is remove the 'Major' column, since its values are not numbers. One option is to change them to corresponding numbers 1-4, but our approach is to make a separate file with responses filtered by major. TODO: we also want to check for survey responses that don't make sense (answered just the default answer for all questions); these should be thrown out.

## Feature Engineering
First, we create $\mathbf{y}$, the feature we would like to predict, as the normalized average of responses to questions 1-15 on the survey. Those questions include, for example, "Secret organizations communicate with extraterrestrials, but keep this fact from the public" and "The spread of certain viruses and/or diseases is the result of the deliberate, concealed efforts of some organization", and participants are asked to respond with their level of agreement from 1 to 5.

In [None]:
os.chdir('data')

In [4]:
df = pd.read_csv("conspiracy_theories_data_orig.csv")
verbose = False

# Measure for General Conspiracy Belief. Normalized average of responses to questions 1-15 of survey
df['GCB'] = df[['Q'+str(i) for i in range(1, 16)]].mean(axis=1) / 5
df.drop(columns=['Q'+str(i) for i in range(1, 16)], inplace=True)

# The survey asked participants what words they knew. Columns VCL6, VCL9, VCL12 were not real words, and were included in 
# order to perform a validity check
df['validity'] = df[['VCL6', 'VCL9', 'VCL12']].mean(axis=1)

# Score how many vocab questions the respondent answered correctly. 0 is correct for VCL 6, 9, 12, and 1 is correct for all others.
df['vocabulary_knowledge'] = (df[['VCL' + str(i) for i in [1, 2, 3, 4, 5, 7, 8, 10, 11, 13, 14, 15, 16]]]).mean(axis=1) 

df.drop(columns=['VCL'+str(i) for i in range(1, 17)], inplace=True)

#I split up every instance of "major" to a category: HUM (Humanities), BUS (business/law), ART, STEM, and OTHER. 
#This block creates a one-hot encoding for each of these.
names = ["STEM", "HUM", "BUS", "OTHER", "ART"]
for name in names:
    with open(f"{name}.txt") as tf:
        majors = [i[:-2] for i in tf.readlines()]
        func = np.vectorize(lambda x: x in majors)
#         df[name] = 1 
        df[name] = df.major.apply(func) # Create  a new column with the one hot encoding for the given category
    
# One hot encode the other features
categorical_columns = ['education','urban', 'gender', 'engnat', 'hand', 'religion', 'orientation','race', 'voted', 'married']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df["constant"] = 1
df.head()

Unnamed: 0,E1,E2,E3,E4,E5,E6,E7,E8,E9,E10,...,race_2,race_3,race_4,race_5,voted_1,voted_2,married_1,married_2,married_3,constant
0,7070,7469,7383,6540,9098,4998,6971,4713,6032,5878,...,0,0,0,1,0,1,1,0,0,1
1,4086,13107,2807,5030,7405,7864,16234,2603,14174,9423,...,0,0,1,0,0,1,1,0,0,1
2,27535,7814,7762,10290,8558,10538,4740,4162,6492,11512,...,0,0,1,0,1,0,1,0,0,1
3,4561,5589,3506,3784,5093,3555,3158,1887,7678,2304,...,0,0,1,0,1,0,1,0,0,1
4,8841,7575,3832,7775,4160,5216,7559,5792,10296,5455,...,0,0,1,0,0,1,0,1,0,1


In [5]:
# df["introelapse"].hist()
df["total_time_taken_(mins)"] = (df["introelapse"] + df["testelapse"] + df["surveyelapse"])/60
df["total_survey_time_taken_(mins)"] = (df["testelapse"] + df["surveyelapse"])/60

print("# Surveys that took over an hour to take (including landing pad time)")
print(sum(df["total_time_taken_(mins)"] >= 60))

print("# Surveys that took over an hour to take (excluding landing pad time)")
print(sum(df["total_survey_time_taken_(mins)"] >= 60))

print("# Surveys that spent over an hour on the landing pad")
print(sum(df["introelapse"]/60 >= 60))

if verbose: 
    df["total_time_taken_(mins)"][df["total_time_taken_(mins)"] < 60].hist()
    plt.subplots()
    df["total_survey_time_taken_(mins)"][df["total_survey_time_taken_(mins)"] < 60].hist()
    plt.subplots()
    df["introelapse"][df["introelapse"] < 60].hist()

# Even though these surveys took a lot longer than seems reasonable, there are no clear indications in the 
# subjects' answers that any of these responses should be dropped. 

# Surveys that took over an hour to take (including landing pad time)
64
# Surveys that took over an hour to take (excluding landing pad time)
20
# Surveys that spent over an hour on the landing pad
44


In [10]:
if verbose: 
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        display(df[df["introelapse"]/60 >= 60])

In [11]:
# Looking at the 50 fastest responses
if verbose:
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(df.sort_values(by="total_time_taken_(mins)")[:50])
    
# Again, none of these look totally wrong. 

In [12]:
# Did any respondents put the same thing for each question in the GCB inventory? 
print("# Rows with matching entries in columsn Q1, Q2, ..., Q15")
print(sum(df[["Q" + str(i) for i in range(1, 16)]].apply(lambda x: min(x) == max(x), 1)))
if verbose:
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(df[df[["Q" + str(i) for i in range(1, 16)]].apply(lambda x: min(x) == max(x), 1)])


# Rows with matching entries in columsn Q1, Q2, ..., Q15


KeyError: "None of [Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',\n       'Q12', 'Q13', 'Q14', 'Q15'],\n      dtype='object')] are in the [columns]"

## Results

In [28]:
y = df['GCB']
X = df.drop(columns=['GCB', 'major']).astype(float)

# Run a L1-regularization to see which features we should keep and what we can get rid of
lamb_list = np.geomspace(10**-10, 10**5, 16)

# Go through each lambda
#for lamb in lamb_list:
model = lm.OLS(y, X).fit_regularized(alpha=1e-3, L1_wt=1)
beta = model.params
nonzero = np.abs(beta) > 0.01
X = X[X.columns[nonzero.values]]
X['const'] = df['constant']
X = X.drop(columns=['TIPI4', 'TIPI6', 'engnat_1', 'gender_2'])
np.asarray(X)
model = lm.OLS(y, X).fit()
print(model.summary().as_latex())

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}    &       GCB        & \textbf{  R-squared:         } &     0.128   \\
\textbf{Model:}            &       OLS        & \textbf{  Adj. R-squared:    } &     0.124   \\
\textbf{Method:}           &  Least Squares   & \textbf{  F-statistic:       } &     30.39   \\
\textbf{Date:}             & Wed, 17 Nov 2021 & \textbf{  Prob (F-statistic):} &  1.16e-65   \\
\textbf{Time:}             &     12:17:05     & \textbf{  Log-Likelihood:    } &    563.65   \\
\textbf{No. Observations:} &        2495      & \textbf{  AIC:               } &    -1101.   \\
\textbf{Df Residuals:}     &        2482      & \textbf{  BIC:               } &    -1026.   \\
\textbf{Df Model:}         &          12      & \textbf{                     } &             \\
\textbf{Covariance Type:}  &    nonrobust     & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}
                        & \textbf{coef}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['const'] = df['constant']


## Conclusion
$R^2$ is small, but we found some variables that are slightly correlated with belief in conspiracy theories, blah blah blah.