In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import statsmodels.regression.linear_model as lm

### Load Data and Engineer Features

In [2]:
data_directory = "data/"
df = pd.read_csv(data_directory + "conspiracy_theories_data_orig.csv")
verbose = False
# Only NaN values are in "major" column, so no other cleaning is necessary to remove NaN values
# Benefit of working with survey data as opposed to data collected using messier methods

# Measure for General Conspiracy Belief. Normalized average of responses to questions 1-15 of survey
df['GCB'] = df[['Q'+str(i) for i in range(1, 16)]].mean(axis=1) / 5

# Score how many vocab questions the respondent answered correctly. 0 is correct for VCL 6, 9, 12, and 1 is correct for all others.
df['vocabulary_knowledge'] = pd.concat((df[['VCL' + str(i) for i in [1, 2, 3, 4, 5, 7, 8, 10, 11, 13, 14, 15, 16]]], 
                                        (1 - df[['VCL' + str(i) for i in [6,9,12]]])), axis=1).mean(axis=1)

# The survey asked participants what words they knew. Columns VCL6, VCL9, VCL12 were not real words, and were included in 
# order to perform a validity check
df['vocabulary_misclassification'] = df[['VCL6', 'VCL9', 'VCL12']].mean(axis=1)

# Split up every instance of "major" to a category: HUM (Humanities), BUS (business/law), ART, STEM, and OTHER. 
# This block creates a one-hot encoding for each of these.
names = ["STEM", "HUM", "BUS", "OTHER", "ART"]
for name in names:
    # For each category, there is a file of strings of majors that should be classified as that category
    # Read in the corresponding file
    tf = open(data_directory + f"{name}.txt", "r",newline='\n')
    # Grab all the strings in the file
    majors = [i[:-2] for i in tf.readlines()]
    def func(x): # If string is in the list of majors, return a 1, else a 0
        return int(x in majors)
    func = np.vectorize(func)
    df[name] = 1 
    df[name] = df.major.apply(func) # Create  a new column with the one hot encoding for the given category
    
# One hot encode the other categorical features
categorical_columns = ['education','urban', 'gender', 'engnat', 'hand', 'religion', 'orientation','race', 'voted', 'married']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df["constant"] = 1

# Engineer some features about the time taken on the survey for reviewing edge cases (not to be used in regression).
df["total_time_taken_(mins)"] = (df["introelapse"] + df["testelapse"] + df["surveyelapse"])/60
df["total_survey_time_taken_(mins)"] = (df["testelapse"] + df["surveyelapse"])/60

### Review Edge Cases 

In [3]:
print("# Surveys that took over an hour to take (including landing pad time)")
print(sum(df["total_time_taken_(mins)"] >= 60))

print("# Surveys that took over an hour to take (excluding landing pad time)")
print(sum(df["total_survey_time_taken_(mins)"] >= 60))

print("# Surveys that spent over an hour on the landing pad")
print(sum(df["introelapse"]/60 >= 60))

if verbose: 
    df["total_time_taken_(mins)"][df["total_time_taken_(mins)"] < 60].hist()
    plt.subplots()
    df["total_survey_time_taken_(mins)"][df["total_survey_time_taken_(mins)"] < 60].hist()
    plt.subplots()
    df["introelapse"][df["introelapse"] < 60].hist()

# Even though these surveys took a lot longer than seems reasonable, there are no clear indications in the 
# subjects' answers that any of these responses should be dropped. 

# Surveys that took over an hour to take (including landing pad time)
64
# Surveys that took over an hour to take (excluding landing pad time)
20
# Surveys that spent over an hour on the landing pad
44


In [4]:
if verbose: 
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        display(df[df["introelapse"]/60 >= 60])

In [5]:
# Looking at the 50 fastest responses
if verbose:
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(df.sort_values(by="total_time_taken_(mins)")[:50])
    
# Again, none of these look responses have any obvious indications that they should be dropped

In [6]:
# Did any respondents put the same thing for each question in the GCB inventory? 
print("# Rows with matching entries in columsn Q1, Q2, ..., Q15")
print(sum(df[["Q" + str(i) for i in range(1, 16)]].apply(lambda x: min(x) == max(x), 1)))
if verbose:
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(df[df[["Q" + str(i) for i in range(1, 16)]].apply(lambda x: min(x) == max(x), 1)])


# Rows with matching entries in columsn Q1, Q2, ..., Q15
98


### Analysis and Regression

In [7]:
# Drop all the columns that are used for feature engineering and cleaning.
df.drop(columns=['Q'+str(i) for i in range(1, 16)], inplace=True) # Drop the specific question information from which GCB is computed
df.drop(columns=['E'+str(i) for i in range(1, 16)], inplace=True) # Timing information
df.drop(columns=['VCL'+str(i) for i in range(1, 17)], inplace=True) # Specific vocabulary questions, rolled into 
df.drop(columns=['total_time_taken_(mins)', 'total_survey_time_taken_(mins)', 
                 'introelapse', 'surveyelapse', 'testelapse'], inplace=True)
df.drop(columns=['major'], inplace=True)

In [8]:
y = df['GCB']
X = df.drop(columns=['GCB'])

# Run a L1-regularization to see which features we should keep and what we can get rid of
lamb_list = np.geomspace(10**-10, 10**5, 16)

# Go through each lambda
#for lamb in lamb_list:
model = lm.OLS(y, X).fit_regularized(alpha=1e-3, L1_wt=1)
beta = model.params
nonzero = np.abs(beta) > 0.01
X = X[X.columns[nonzero.values]]
X['const'] = df['constant']
# X = X.drop(columns=['TIPI4', 'TIPI6', 'TIPI7', 'engnat_1', 'gender_2'])
model = lm.OLS(y, X).fit()
print(model.summary().as_latex())


\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}                &       GCB        & \textbf{  R-squared:         } &     0.139   \\
\textbf{Model:}                        &       OLS        & \textbf{  Adj. R-squared:    } &     0.133   \\
\textbf{Method:}                       &  Least Squares   & \textbf{  F-statistic:       } &     22.29   \\
\textbf{Date:}                         & Wed, 17 Nov 2021 & \textbf{  Prob (F-statistic):} &  3.82e-68   \\
\textbf{Time:}                         &     12:41:16     & \textbf{  Log-Likelihood:    } &    579.98   \\
\textbf{No. Observations:}             &        2495      & \textbf{  AIC:               } &    -1122.   \\
\textbf{Df Residuals:}                 &        2476      & \textbf{  BIC:               } &    -1011.   \\
\textbf{Df Model:}                     &          18      & \textbf{                     } &             \\
\textbf{Covariance Type:}              &    nonrobust     & \textbf{                     }

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['const'] = df['constant']


In [9]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(X.isna().sum())

TIPI2                           0
TIPI5                           0
TIPI6                           0
vocabulary_misclassification    0
STEM                            0
education_2                     0
education_3                     0
urban_3                         0
gender_2                        0
engnat_1                        0
religion_2                      0
religion_3                      0
religion_7                      0
religion_12                     0
orientation_2                   0
orientation_5                   0
voted_2                         0
married_1                       0
constant                        0
const                           0
dtype: int64