In [10]:
import os
import pandas as pd
import numpy as np
import warnings
import datetime as dt
import statsmodels.formula.api as smf
warnings.filterwarnings('ignore')

In [15]:
def Create_Data_Logit(PATH, cols_to_use):
    
    # Keep the data we need
    
    # Load the csv with only the desired columns
    df = pd.read_csv(PATH, usecols = cols_to_use)
    # Drop the Nan values
    df.dropna(inplace = True)
    # Only keep hispanic, black and white people
    df = df.iloc[[x in ['hispanic', 'black', 'white'] for x in df.subject_race]]
    # Convert date to ordinal
    df.date = pd.to_datetime(df.date)
    df.date = df.date.map(dt.datetime.toordinal)
    # Only keep data after 2009
    df = df[df.date > dt.datetime.toordinal(dt.datetime(2009,1,1))]
    
    # Create the search_rates
    
    # Make search_conducted be 0 or 1
    df.search_conducted =  df.search_conducted.astype(int)
    # Train a model
    res = smf.logit(formula = 'search_conducted ~ date + C(subject_race):C(subject_sex)', data = df).fit()
    # Get the search_rates
    p = res.predict(df[['date', 'subject_race', 'subject_sex']])
    df['search_rate'] = p
    # No need for search_conducted anymore
    df.drop('search_conducted', axis = 1, inplace = True)
    # Make the date be datetime again
    df.date = df.date.map(dt.date.fromordinal)
    df.date = pd.to_datetime(df.date)
    
    return df

In [16]:
PATH = 'Not_Zip/'

In [17]:
States = ['AZ', 'CA', 'CT', 'IL', 'NC', 'OH', 'RI', 'SC', 'TX', 'WI']
cols_to_use = ['date', 'subject_race', 'subject_sex', 'search_conducted']

In [18]:
for state in States:
    
    df = Create_Data_Logit(PATH+state+'.csv', cols_to_use)
    df.to_csv('Logit_Files/'+state+'.csv')

Optimization terminated successfully.
         Current function value: 0.218610
         Iterations 21
Optimization terminated successfully.
         Current function value: 0.154180
         Iterations 9
         Current function value: 0.160390
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.184287
         Iterations 8
         Current function value: 0.114237
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.140794
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.148930
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.125029
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.089807
         Iterations 18
Optimization terminated successfully.
         Current function value: 0.077473
         Iterations 16


In [18]:
def groupyweek(df):
    df = df.groupby(['subject_race', pd.Grouper(key = 'date', freq = '1W')]).mean()
    return df