In [1]:
import numpy as np
import pandas as pd
import zipcodes
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import random

random.seed(0)
np.random.seed(0)

INFILE = '../data/pennycook_et_al_study2_raw.csv'
OUTFILE = '../data/pennycook_et_al_study2_clean.csv'

In [2]:
df = pd.read_csv(INFILE)
df.head()

Unnamed: 0,Condition,Fake,Real,Discern,CRT_ACC,SciKnow,MMS,ResponseID,ResponseSet,Name,...,site_gdp_30,min_dist_15,site_pop_15,site_gdp_15,min_dist_10,site_pop_10,site_gdp_10,min_dist_1,site_pop_1,site_gdp_1
0,2,5.133333,5.333333,0.2,0.0,0.588235,4.7,R_3J34Pa60coQnKDu,Default Response Set,Anonymous,...,13056917.0,325.56564,230163.0,11564651.0,325.56564,381951.0,17471690.0,74.903283,76351.0,2460726.0
1,1,2.266667,2.933333,0.666667,0.333333,0.588235,5.3,R_2s6gfRcfu9jRhK0,Default Response Set,Anonymous,...,303694235.0,331.814277,2233163.0,303694235.0,331.814277,2233163.0,303694235.0,161.218613,97331.0,5790087.0
2,1,1.0,1.0,0.0,0.0,0.352941,4.5,R_3jTm56ryzVTu3Ic,Default Response Set,Anonymous,...,1101987.0,1330.622598,128206.0,8114344.0,9.413242,230163.0,11564651.0,9.413242,716492.0,77455208.0
3,1,1.0,1.0,0.0,0.333333,0.764706,2.5,R_1P0Oso567HTKLNE,Default Response Set,Anonymous,...,1101987.0,339.17539,128206.0,8114344.0,31.783372,191659.0,10014469.0,31.783372,2231647.0,122423873.0
4,1,4.933333,4.733333,-0.2,0.0,0.529412,5.2,R_1isxtrw6MT71zyt,Default Response Set,Anonymous,...,1101987.0,1284.563436,230163.0,11564651.0,702.437648,44527.0,1940909.0,11.04599,2637772.0,264438886.0


In [3]:
# one entry was -3105
df['education'] = df.education.replace(-3105, None)

# Sharing type 5 and 6 coded as 6 and 7, respectively
df['SharingType_5'] = df['SharingType_6']
df['SharingType_6'] = df['SharingType_7']
df = df.drop(columns='SharingType_7')
# SharingType_* missing values are nan; replace with 0
for i in range(1, 7):
    var = 'SharingType_'+str(i)
    df[var] = df[var].fillna(0)
    
# SocialMedia_* missing values are nan; replace with 0
for i in range(1, 7):
    var = 'SocialMedia_'+str(i)
    df[var] = df[var].fillna(0)
    
# Ethniticy_* missing values are nan; replace with 0
for i in range(1, 7):
    var = 'Ethnicity_'+str(i)
    df[var] = df[var].fillna(0)
    
df = df.rename(columns={
    'COVID_concern_1': 'COVID_concern',
    'DemRep_C': 'Partisan',
    'Age.0': 'Age',
    'Education_1': 'Education'
})
df['Treatment'] = df.Condition == 2
# COVID_news originally coded 1, 2, 3, 4, and 7. Change to 1, 2, 3, 4, and 5
df['COVID_news'].iloc[df.COVID_news==7] = 5
# indicate correct CRT responses
df['CRT1_1_corr'] = df.CRT1_1 == 4
df['CRT1_1_intuit'] = df.CRT1_1 == 8
df['CRT1_2_corr'] = df.CRT1_2 == 10
df['CRT1_2_intuit'] = df.CRT1_2 == 50
df['CRT1_3_corr'] = df.CRT1_3 == 39
df['CRT1_3_intuit'] = df.CRT1_3 == 20
df['CRT3_1_corr'] = df.CRT3_1 == 2
df['CRT3_1_intuit'] = df.CRT3_1 == 1
df['CRT3_2_corr'] = df.CRT3_2 == 8
df['CRT3_2_intuit'] = df.CRT3_2 == 7
# CRT3_3 is miscoded; asks for a text response but all entries are numeric responses
# df['CRT3_3_corr'] = df.CRT3_3.str.lower().strip() == 'emily'
df = df.drop(columns=['CRT3_3'])
df['CRT_chk'] = df.CRT_chk == 1
df['Party'] = df.Party.map({
    1: 'Democrat',
    2: 'Republican',
    3: 'Independent',
    4: 'Other'
})
df['POTUS2016'] = df.POTUS2016.map({
    1: 'Clinton',
    2: 'Trump',
    3: 'Other',
    4: 'Unable to vote',
    5: "Didn't vote but could have",
    6: "Didn't vote in protest"
})
df['Male'] = df['Gender.0'] == 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [4]:
def geography(zipcode, key):
    try:
        loc = zipcodes.matching(str(int(zipcode)))[0]
        return loc[key]
    except:
        return None

# df['State'] = df.Zipcode.apply(lambda x: geography(x, 'state'))
# df['County'] = df.Zipcode.apply(lambda x: geography(x, 'county'))
# df['City'] = df.Zipcode.apply(lambda x: geography(x, 'city'))
# df['Lat'] = df.Zipcode.apply(lambda x: geography(x, 'lat'))
# df['Long'] = df.Zipcode.apply(lambda x: geography(x, 'long'))

In [5]:
def col_startswith(startswith):
    return [col for col in df.columns if col.startswith(startswith)]

df = df[
    col_startswith('SharingType')
    + col_startswith('SocialMedia')
    + col_startswith('CRT')
    + col_startswith('sci')
    + col_startswith('mms')
    + col_startswith('Media')
    + col_startswith('Ethnicity')
    + col_startswith('Fake')
    + col_startswith('Real')
    + [
        'COVID_concern',
        'COVID_news',
        'AccImp',
        'Age',
        'Male',
        'Education',
        'Income',
        'English',
        'Party',
        'Partisan',
        'Social_Conserv',
        'Economic_Conserv',
        'POTUS2016',
        'Treatment',
#         'State',
#         'County',
#         'City',
#         'Lat',
#         'Long',
#         'Random',
#         'Google',
#         'CRT_Rand'
    ]
]
df.head()

Unnamed: 0,SharingType_1,SharingType_2,SharingType_3,SharingType_4,SharingType_6,SharingType_7_TEXT,SharingType_5,SocialMedia_1,SocialMedia_2,SocialMedia_3,...,Male,Education,Income,English,Party,Partisan,Social_Conserv,Economic_Conserv,POTUS2016,Treatment
0,1.0,1.0,1.0,1.0,0.0,,1.0,1.0,1.0,1.0,...,True,17.0,9.0,1,Democrat,3.0,1,2,Clinton,True
1,1.0,0.0,1.0,0.0,0.0,,0.0,1.0,1.0,0.0,...,False,19.0,3.0,1,Republican,5.0,4,4,Trump,False
2,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,...,False,16.0,7.0,1,Independent,3.0,2,2,Clinton,False
3,0.0,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,...,True,13.0,4.0,1,Democrat,2.0,4,4,Clinton,False
4,1.0,1.0,1.0,0.0,0.0,,1.0,1.0,1.0,1.0,...,True,14.0,6.0,1,Democrat,1.0,5,5,Clinton,False


In [6]:
df = df.drop(columns=(
    col_startswith('CRT_ACC')
    + col_startswith('Real1_RT')
    + col_startswith('Fake1_RT')
    + [
        'SharingType_7_TEXT',
        'SocialMedia_6_TEXT',
        'CRT_Inst',
        'CRT_Thomson',
        'Ethnicity_6_TEXT',
        'Real', # original authors drop Real and Fake, so I follow this here
        'Fake'
    ]
))

In [7]:
# keep only participants who responded to all items
df = df.dropna(subset=[
    col for col in df.columns 
    if col.startswith('Real') or col.startswith('Fake') or col=='Treatment'
])
# calculate difference in average preference for sharing real over fake news
df['Real'] = df[[col for col in df.columns if col.startswith('Real')]].mean(axis=1)
df['Fake'] = df[[col for col in df.columns if col.startswith('Fake')]].mean(axis=1)
df['Diff'] = (df['Fake'] - df['Real'])
# drop columns used to construct the dependent variable
df = df.drop(columns=[
    col for col in df.columns
    if col.startswith('Real1') or col.startswith('Fake1')
])
df = df.drop(columns=['Real', 'Fake'])

In [8]:
dummy_cols = [
    'Party',
    'POTUS2016',
]
df = pd.get_dummies(df, columns=dummy_cols)

In [9]:
# impute missing values for other measures
X_df = df.drop(columns='Diff')
X = IterativeImputer().fit_transform(X_df)
imputed_df = pd.DataFrame(columns=X_df.columns, data=X)
imputed_df['Diff'] = df.Diff.values

In [11]:
def combine_cols(df, prefix):
    cols = [col for col in df.columns if col.startswith(prefix+'_')]
    values = {i: col[len(prefix+'_'):] for i, col in enumerate(cols)}
    x = pd.Series(np.argmax(df[cols].values, axis=1))
    df[prefix] = x.map(values)
    return df.drop(columns=cols)

for col in dummy_cols:
    imputed_df = combine_cols(imputed_df, col)
    
# drop the CRT columns which indicate correct or intuitive answers
# these will be added in the preprocessor
imputed_df = imputed_df.drop(columns=[
    col for col in imputed_df.columns 
    if col.startswith('CRT') and (col.endswith('corr') or col.endswith('intuit'))
])

In [13]:
missing_cols = imputed_df.columns[imputed_df.isna().any()].tolist()
print('Dropping columns with missing values: ', missing_cols)
imputed_df = imputed_df.drop(columns=missing_cols)

Dropping columns with missing values:  []


In [14]:
# save
imputed_df.to_csv(OUTFILE, index=False)