# Regression Model

## Load Data

In [1]:
%autosave 20

#basic library
import pandas as pd
import numpy as np
import collections
from collections import defaultdict


#model training
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# visualization
import matplotlib.pyplot as plt
import seaborn as sns
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is not available in later releases. Use the new name
# when this matplotlib provides it, otherwise fall back to the old one.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Data statistics
from scipy import stats



# Display the value of every expression statement in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# Lift pandas' display limits so no columns or rows are elided in cell output
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

Autosaving every 20 seconds


In [2]:
# Read the workbook; its first column is used as the row index
df = pd.read_excel(io='databaseForFunction.xlsx', index_col=0)

In [3]:
df.head()

Unnamed: 0,State,County,Year,Month,PDensity,Population,SNAP_Applications,numberOfWorkers,numberOfDisaster,google_calfresh,google_food_bank,google_food_pantry,google_food_stamps,google_supplemental,google_snap,date_time,last_date_time,last_snap,last_worker,last_disaster,last_google_calfresh,last_google_food_bank,last_google_food_pantry,last_google_food_stamps,last_google_supplemental,last_google_snap
0,California,Alameda,2019,1,1898.5,1559308,5515,45,0,24.0,28.75,38.0,55.5,54.5,0.0,2019-01-01,2018-12-01,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,California,Alameda,2019,2,1898.5,1559308,4478,397,0,27.25,27.75,38.75,31.25,36.75,0.0,2019-02-01,2019-01-01,5515,45,0,24.0,28.75,38.0,55.5,54.5,0.0
2,California,Alameda,2019,3,1898.5,1559308,5041,191,0,19.6,23.2,33.8,31.8,35.2,0.0,2019-03-01,2019-02-01,4478,397,0,27.25,27.75,38.75,31.25,36.75,0.0
3,California,Alameda,2019,4,1898.5,1559308,5253,808,0,23.25,22.5,35.5,22.0,29.25,0.0,2019-04-01,2019-03-01,5041,191,0,19.6,23.2,33.8,31.8,35.2,0.0
4,California,Alameda,2019,5,1898.5,1559308,8074,502,0,35.0,26.75,31.5,29.5,31.25,0.0,2019-05-01,2019-04-01,5253,808,0,23.25,22.5,35.5,22.0,29.25,0.0


## Add one more feature - dummy value for state

In [4]:
# split CA and Texas using dummy variables

# Create one 0/1 indicator column per state.
# dtype=int is explicit because pandas >= 2.0 returns bool dummies by default;
# a bool column mixed with float columns turns into an object array when the
# frame is later handed to statsmodels, which then refuses to fit.
dummies = pd.get_dummies(df['State'], prefix='State', dtype=int)

# Append the indicator columns to the right of the original dataframe
df = pd.concat([df, dummies], axis=1)


In [5]:
# Drop State_Texas: State_California is the dummy column kept as the state indicator

df = df.drop('State_Texas', axis=1)

## Update DatabaseForFunction, May 8, 2023
1. Delete column ‘snap_per_capita’
2. Delete columns ‘last_google_snap’ and ‘google_snap’ because their meaning overlaps with the Google search term ‘supplemental nutrition assistance program’
3. Add seasonal dummy variables
- summer: whether the last month is 6/7/8
- holiday: whether the last month is 11/12


In [12]:
df.head()

Unnamed: 0,State,County,Year,Month,PDensity,Population,SNAP_Applications,numberOfWorkers,numberOfDisaster,google_calfresh,google_food_bank,google_food_pantry,google_food_stamps,google_supplemental,date_time,last_date_time,last_snap,last_worker,last_disaster,last_google_calfresh,last_google_food_bank,last_google_food_pantry,last_google_food_stamps,last_google_supplemental,State_California,summer,holiday
0,California,Alameda,2019,1,1898.5,1559308,5515,45,0,24.0,28.75,38.0,55.5,54.5,2019-01-01,2018-12-01,0,0,0,0.0,0.0,0.0,0.0,0.0,1,0,1
1,California,Alameda,2019,2,1898.5,1559308,4478,397,0,27.25,27.75,38.75,31.25,36.75,2019-02-01,2019-01-01,5515,45,0,24.0,28.75,38.0,55.5,54.5,1,0,0
2,California,Alameda,2019,3,1898.5,1559308,5041,191,0,19.6,23.2,33.8,31.8,35.2,2019-03-01,2019-02-01,4478,397,0,27.25,27.75,38.75,31.25,36.75,1,0,0
3,California,Alameda,2019,4,1898.5,1559308,5253,808,0,23.25,22.5,35.5,22.0,29.25,2019-04-01,2019-03-01,5041,191,0,19.6,23.2,33.8,31.8,35.2,1,0,0
4,California,Alameda,2019,5,1898.5,1559308,8074,502,0,35.0,26.75,31.5,29.5,31.25,2019-05-01,2019-04-01,5253,808,0,23.25,22.5,35.5,22.0,29.25,1,0,0


In [8]:
# 3. Add seasonal dummy variables

# 3.1 add 'summer' -> whether last month is 6/7/8
# 3.2 add 'holiday' -> whether last month is 11/12
summer_month = [6, 7, 8]
holiday_month = [11, 12]

# Month number of the previous period. The column is addressed by name rather
# than by position, so the result does not depend on column order and the cell
# can be re-run safely after other columns have been added or dropped.
last_month = pd.to_datetime(df['last_date_time']).dt.month

# 1 when the previous month falls in the season, 0 otherwise
df['summer'] = last_month.isin(summer_month).astype('int64')
df['holiday'] = last_month.isin(holiday_month).astype('int64')

In [11]:
# 1. 'snap_per_capita' is listed for deletion, but it is not dropped in this cell
#    NOTE(review): presumably it was already removed from the workbook - confirm
# 2. Delete the columns for the Google search term 'snap'

df = df.drop(['google_snap', 'last_google_snap'], axis=1)

### updated database May 8, 2023

In [13]:
# Write the updated dataframe (index included) to a new workbook
df.to_excel(excel_writer='databaseForFunction_May8.xlsx')

## Choose columns to train 
- use last month's data to predict this month's SNAP applications

In [14]:
# Start from every column name; the columns not used by the model are removed next
col_keepinmodel = df.columns.tolist()

In [15]:
# Columns to exclude from the model: identifiers, dates, and the current-month
# versions of features whose last_* counterparts are kept
coltomove = ['State', 'County', 'Year', 'Month', 'date_time', 'last_date_time',
             'numberOfWorkers', 'numberOfDisaster', 'google_calfresh',
             'google_food_bank', 'google_food_pantry', 'google_food_stamps',
             'google_supplemental']

# Filter rather than call list.remove() in a loop: remove() raises ValueError
# once a name is already gone, so the cell could not be run a second time.
# Note that a name in coltomove that is not present is now silently ignored.
col_keepinmodel = [c for c in col_keepinmodel if c not in coltomove]

col_keepinmodel

['PDensity',
 'Population',
 'SNAP_Applications',
 'last_snap',
 'last_worker',
 'last_disaster',
 'last_google_calfresh',
 'last_google_food_bank',
 'last_google_food_pantry',
 'last_google_food_stamps',
 'last_google_supplemental',
 'State_California',
 'summer',
 'holiday']

In [16]:
# Keep only the columns selected for modelling
df = df.loc[:, col_keepinmodel]

In [17]:
df.head()

Unnamed: 0,PDensity,Population,SNAP_Applications,last_snap,last_worker,last_disaster,last_google_calfresh,last_google_food_bank,last_google_food_pantry,last_google_food_stamps,last_google_supplemental,State_California,summer,holiday
0,1898.5,1559308,5515,0,0,0,0.0,0.0,0.0,0.0,0.0,1,0,1
1,1898.5,1559308,4478,5515,45,0,24.0,28.75,38.0,55.5,54.5,1,0,0
2,1898.5,1559308,5041,4478,397,0,27.25,27.75,38.75,31.25,36.75,1,0,0
3,1898.5,1559308,5253,5041,191,0,19.6,23.2,33.8,31.8,35.2,1,0,0
4,1898.5,1559308,8074,5253,808,0,23.25,22.5,35.5,22.0,29.25,1,0,0


# Build the Model

#### 1. check the correlation

In [18]:
# Correlation of every column with the target, SNAP_Applications
df.corr().loc[:, 'SNAP_Applications']

PDensity                    0.631942
Population                  0.933607
SNAP_Applications           1.000000
last_snap                   0.947601
last_worker                 0.392523
last_disaster               0.036997
last_google_calfresh        0.192687
last_google_food_bank       0.004543
last_google_food_pantry     0.011594
last_google_food_stamps     0.045971
last_google_supplemental    0.043198
State_California            0.217959
summer                      0.004451
holiday                    -0.006436
Name: SNAP_Applications, dtype: float64

In [19]:
# Target is SNAP_Applications; every other remaining column is a feature
y = df['SNAP_Applications']
X = df.drop('SNAP_Applications', axis=1)

#### 2. Result of the linear regression model

In [20]:
import statsmodels.api as sm
import pandas as pd

# Add the intercept column ('const') to the feature matrix
X = sm.add_constant(X)

# Ordinary least squares: regress y (SNAP_Applications) on the columns of X
model = sm.OLS(endog=y, exog=X).fit()

# Show the full regression table
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:      SNAP_Applications   R-squared:                       0.925
Model:                            OLS   Adj. R-squared:                  0.925
Method:                 Least Squares   F-statistic:                 1.040e+04
Date:                Mon, 08 May 2023   Prob (F-statistic):               0.00
Time:                        17:24:40   Log-Likelihood:                -94156.
No. Observations:               10908   AIC:                         1.883e+05
Df Residuals:                   10894   BIC:                         1.884e+05
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

If the error "SVD did not converge" occurs, see:

https://blog.csdn.net/lijieling123/article/details/112910530