In [58]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [59]:
df = pd.read_csv("conspiracy_theories_data_orig.csv")

# The only NaN values are in the "major" column, so no other cleaning is necessary.
# This is a benefit of working with survey data rather than data collected by messier methods.
# TODO: check for survey responses that don't make sense (e.g. the default answer given for
# every question); these should be thrown out.

In [60]:
# General Conspiracy Belief (GCB) score: the row-wise mean of survey items Q1..Q15,
# divided by 5 to normalize it (presumably 5 is the top of the answer scale -- confirm
# against the survey codebook).
df["GCB"] = df[[f"Q{n}" for n in range(1, 16)]].mean(axis=1).div(5)

In [61]:
# The survey asked participants which words they knew. VCL6, VCL9 and VCL12 were not
# real words; they were included as a validity check on the answers.
# 'validity' is the row-wise mean over those three decoy columns, and
# 'vocabulary_knowledge' is the row-wise mean over the remaining VCL1..VCL16 columns.
# (Assumes the VCL columns are 0/1 "known" flags -- confirm against the codebook.)
decoy_word_cols = ['VCL6', 'VCL9', 'VCL12']
real_word_cols = [f'VCL{n}' for n in range(1, 17) if n not in (6, 9, 12)]

df['validity'] = df[decoy_word_cols].mean(axis=1)
df['vocabulary_knowledge'] = df[real_word_cols].mean(axis=1)

In [78]:
# Every value of "major" was assigned to one category: HUM (humanities), BUS (business/law),
# ART, STEM, or OTHER. Each category has a text file "<NAME>.txt" listing its majors,
# one per line. This cell adds a 0/1 indicator column per category (one-hot encoding).
# Rows whose major is NaN, or is not listed in a category's file, get 0 for that category.
names = ["STEM", "HUM", "BUS", "OTHER", "ART"]
for name in names:
    # newline='\n' turns off newline translation, so lines from a CRLF file keep their '\r'.
    # rstrip("\r\n") removes whichever terminator is present and, unlike slicing off a
    # fixed two characters, leaves a final line without a terminator intact.
    with open(f"{name}.txt", "r", newline='\n') as tf:
        majors = [line.rstrip("\r\n") for line in tf]
    # Vectorized membership test; NaN majors are simply not members, so they map to 0.
    df[name] = df["major"].isin(majors).astype("int64")

In [79]:
# Sanity check: list the distinct values present in the STEM indicator column.
df["STEM"].unique()

array([0, 1], dtype=int64)

In [80]:
# Preview the first 20 rows, including the derived columns added above.
df.head(20)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,familysize,major,GCB,validity,vocabulary_knowledge,STEM,HUM,BUS,OTHER,ART
0,5,5,3,5,5,5,5,3,4,5,...,1,ACTING,0.906667,0.0,0.846154,0,0,0,0,1
1,5,5,5,5,5,3,5,5,1,4,...,1,,0.866667,0.0,0.692308,0,0,0,0,0
2,2,4,1,2,2,2,4,2,2,4,...,2,philosophy,0.493333,0.333333,1.0,0,1,0,0,0
3,5,4,1,2,4,5,4,1,4,5,...,3,history,0.733333,0.0,0.846154,0,1,0,0,0
4,5,4,1,4,4,5,4,3,1,5,...,2,Psychology,0.786667,0.0,0.692308,0,1,0,0,0
5,1,1,1,1,1,1,1,1,1,1,...,2,nursing,0.2,0.666667,0.846154,1,0,0,0,0
6,4,3,3,3,4,3,3,4,2,3,...,2,,0.613333,0.0,0.692308,0,0,0,0,0
7,5,4,3,3,4,5,5,5,5,5,...,3,,0.88,0.0,0.615385,0,0,0,0,0
8,1,1,1,1,1,1,1,1,1,1,...,2,,0.2,0.0,0.692308,0,0,0,0,0
9,1,2,1,1,1,5,1,1,1,4,...,3,it,0.36,0.0,0.769231,1,0,0,0,0
