In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse
import numpy as np

## 1. Figure out your question

The question we want to answer using machine learning is: Can naming trends accurately predict religiosity in a region?

## 2. Obtain a labeled dataset

The name data we are using comes from the Social Security Administration Database. 
The religous data is compiled from various sources that include: Religious Landscape Survey, American National Election Studies, Churches and Church Membership in the United States, History of American Religion, and the United States Census of American Religion. 
The Bible names were scraped from Wikipedia. 

In [2]:
df = pd.read_csv("TestTrainData.csv")
print(df.head())

   stateID  year  percentbible  percentAaliyah  percentAaron  percentAbbey  \
0        2  1916      0.290625             0.0      0.000000           0.0   
1        2  1926      0.415648             0.0      0.000000           0.0   
2        2  1971      0.201347             0.0      0.006558           0.0   
3        2  1980      0.253605             0.0      0.008438           0.0   
4        2  1990      0.236164             0.0      0.007269           0.0   

   percentAbbie  percentAbbigail  percentAbby  percentAbdirahman  ...  \
0             0              0.0          0.0                  0  ...   
1             0              0.0          0.0                  0  ...   
2             0              0.0          0.0                  0  ...   
3             0              0.0          0.0                  0  ...   
4             0              0.0          0.0                  0  ...   

   percentZelma  percentZia  percentZion  percentZoe  percentZoey  \
0             0        

In [3]:
rel = df.fillna(df.mean())
rel

Unnamed: 0,stateID,year,percentbible,percentAaliyah,percentAaron,percentAbbey,percentAbbie,percentAbbigail,percentAbby,percentAbdirahman,...,percentZelma,percentZia,percentZion,percentZoe,percentZoey,percentZola,percentZora,percentZuri,percentZyaire,christian
0,2,1916,0.290625,0.000000,0.000000,0.0,0,0.0,0.000000,0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
1,2,1926,0.415648,0.000000,0.000000,0.0,0,0.0,0.000000,0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
2,2,1971,0.201347,0.000000,0.006558,0.0,0,0.0,0.000000,0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.370368
3,2,1980,0.253605,0.000000,0.008438,0.0,0,0.0,0.000000,0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
4,2,1990,0.236164,0.000000,0.007269,0.0,0,0.0,0.000000,0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,56,1980,0.235450,0.000000,0.007135,0.0,0,0.0,0.000000,0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
487,56,1990,0.246820,0.000000,0.007339,0.0,0,0.0,0.000000,0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
488,56,2000,0.226064,0.000000,0.003324,0.0,0,0.0,0.001662,0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
489,56,2007,0.214286,0.000000,0.003599,0.0,0,0.0,0.000000,0,...,0,0,0.0,0.003045,0.000000,0.0,0,0,0,0.807960


In [4]:
y = rel['christian']
X = rel[[x for x in rel.columns if x !='christian']]

## 3. Divide into training and set sets

In [5]:
xTrain, xTest, yTrain, yTest = train_test_split(X, y,test_size=0.33,random_state=42)

## 4. Pick an appropriate method
We will begin with Lasso.

In [6]:
lasso = Lasso(alpha=0.001, max_iter=100000).fit(xTrain, yTrain)
print("Training set score: {:.2f}".format(lasso.score(xTrain, yTrain)))
print("Test set score: {:.2f}".format(lasso.score(xTest, yTest)))
print("Number of features used:", np.sum(lasso.coef_ != 0))
scores = cross_val_score(lasso,xTrain,yTrain,cv=5)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))


Training set score: 0.26
Test set score: 0.23
Number of features used: 2
Cross-validation scores: [0.18503724 0.37649983 0.27201033 0.1713364  0.19152656]
Average cross-validation score: 0.24


## 5. Choose regularization parameters via cross-validation on the training set

In [7]:
alpha_grid = {'alpha': [.0001, .001, .002, .004, .006, .008, .01, .012, .014, .016 ,.018, .02 ],'max_iter': [100000]}
grid_search = GridSearchCV(Lasso(),alpha_grid,cv=5,return_train_score=True)
best_model=grid_search.fit(xTrain,yTrain)
print("Best alpha: ",best_model.best_estimator_.get_params()['alpha'])

Best alpha:  0.0001




In [8]:
lasso = Lasso(alpha=0.0001, max_iter=100000).fit(xTrain, yTrain)
print("Training set score: {:.2f}".format(lasso.score(xTrain, yTrain)))
print("Test set score: {:.2f}".format(lasso.score(xTest, yTest)))
print("Number of features used:", np.sum(lasso.coef_ != 0))

Training set score: 0.37
Test set score: 0.31
Number of features used: 6


## 6. Fit model on whole training set using the cross-validated parameters

In [9]:
lasso.fit(xTrain,yTrain)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=100000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

## 7. Evaluate model by applying it to test set

In [10]:
lasso.predict(xTest)
print("Test set score: {:.2f}".format(lasso.score(xTest, yTest)))

Test set score: 0.31


## 8. Repeat 4-7 for other methods

In [12]:
#Ridge regression
alpha_grid = {'alpha': [.0001, .001, .002, .004, .006, .008, .01, .012, .014, .016 ,.018, .02 ],'max_iter': [100000]}
grid_search = GridSearchCV(Ridge(),alpha_grid,cv=5,return_train_score=True)
best_model=grid_search.fit(xTrain,yTrain)
print("Best alpha: ",best_model.best_estimator_.get_params()['alpha'])

Best alpha:  0.0001




In [15]:
ridge = Ridge(alpha=.0001).fit(xTrain, yTrain)
print("Training set score: {:.2f}".format(ridge.score(xTrain, yTrain)))
print("Test set score: {:.2f}".format(ridge.score(xTrain, yTrain)))

Training set score: 0.96
Test set score: 0.96


In [16]:
#Lasso and Ridge excluding Bible variable
rel2 = rel.drop(columns=['percentbible'])
rel2

Unnamed: 0,stateID,year,percentAaliyah,percentAaron,percentAbbey,percentAbbie,percentAbbigail,percentAbby,percentAbdirahman,percentAbel,...,percentZelma,percentZia,percentZion,percentZoe,percentZoey,percentZola,percentZora,percentZuri,percentZyaire,christian
0,2,1916,0.000000,0.000000,0.0,0,0.0,0.000000,0,0.0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
1,2,1926,0.000000,0.000000,0.0,0,0.0,0.000000,0,0.0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
2,2,1971,0.000000,0.006558,0.0,0,0.0,0.000000,0,0.0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.370368
3,2,1980,0.000000,0.008438,0.0,0,0.0,0.000000,0,0.0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
4,2,1990,0.000000,0.007269,0.0,0,0.0,0.000000,0,0.0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,56,1980,0.000000,0.007135,0.0,0,0.0,0.000000,0,0.0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
487,56,1990,0.000000,0.007339,0.0,0,0.0,0.000000,0,0.0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
488,56,2000,0.000000,0.003324,0.0,0,0.0,0.001662,0,0.0,...,0,0,0.0,0.000000,0.000000,0.0,0,0,0,0.588436
489,56,2007,0.000000,0.003599,0.0,0,0.0,0.000000,0,0.0,...,0,0,0.0,0.003045,0.000000,0.0,0,0,0,0.807960


In [17]:
y = rel2['christian']
X = rel2[[x for x in rel2.columns if x !='christian']]
x_Train, x_Test, y_Train, y_Test = train_test_split(X, y,test_size=0.33,random_state=42)

In [18]:
alpha_grid = {'alpha': [.0001, .001, .002, .004, .006, .008, .01, .012, .014, .016 ,.018, .02 ],'max_iter': [100000]}
grid_search = GridSearchCV(Lasso(),alpha_grid,cv=5,return_train_score=True)
best_model=grid_search.fit(x_Train,y_Train)
print("Best alpha: ",best_model.best_estimator_.get_params()['alpha'])

Best alpha:  0.0001




In [19]:
lasso = Lasso(alpha=0.0001, max_iter=100000).fit(x_Train, y_Train)
print("Training set score: {:.2f}".format(lasso.score(x_Train, y_Train)))
print("Test set score: {:.2f}".format(lasso.score(x_Test, y_Test)))
print("Number of features used:", np.sum(lasso.coef_ != 0))

Training set score: 0.37
Test set score: 0.31
Number of features used: 5


In [21]:
ridge = Ridge(alpha=.0001).fit(x_Train, y_Train)
print("Training set score: {:.2f}".format(ridge.score(x_Train, y_Train)))
print("Test set score: {:.2f}".format(ridge.score(x_Train, y_Train)))

Training set score: 0.96
Test set score: 0.96


## 9. Apply the chosen method to new observations for which we have no labels

In [29]:
out = pd.read_csv("ApplyData.csv")

In [31]:
lasso.predict(out)

TypeError: 'DataFrame' object is not callable