In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse

## 1. Figure out your question

The question we want to answer using machine learning is: Can naming trends accurately predict religiosity in a region?

## 2. Obtain a labeled dataset

The name data we are using comes from the Social Security Administration Database. 
The religious data is compiled from various sources, including the Religious Landscape Survey, American National Election Studies, Churches and Church Membership in the United States, History of American Religion, and the United States Census of American Religion. 
The Bible names were scraped from Wikipedia. 

In [None]:
# Location of the input CSV.
# NOTE(review): ".csv" looks like an unfilled placeholder — confirm the real filename.
data_path = ".csv"
df = pd.read_csv(data_path)

# Plain-text preview of the first five rows (five is also head()'s default).
print(df.head(n=5))


In [None]:
# Name of the label column.
# NOTE(review): '' looks like an unfilled placeholder — confirm the real column name.
target_col = ''

# The label is a single column; every other column is kept as a feature,
# in the order it appears in the frame.
y = df[target_col]
feature_cols = [col for col in df.columns if col != target_col]
X = df[feature_cols]

## 3. Divide into training and test sets

In [None]:
# Hold out 33% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## 4. Pick an appropriate method
We will begin with Lasso.

In [None]:
# Baseline Lasso with a hand-picked penalty strength, fitted on the training split.
lasso = Lasso(alpha=0.001, max_iter=100000)
lasso.fit(xTrain, yTrain)

# 5-fold cross-validation on the training data only; the test split is not touched here.
scores = cross_val_score(lasso, xTrain, yTrain, cv=5)
mean_score = scores.mean()

print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(mean_score))


## 5. Choose regularization parameters via cross-validation on the training set

In [None]:
# Candidate penalty strengths to compare. max_iter has a single value, so every
# candidate is fitted with the same iteration cap.
alpha_grid = {
    'alpha': [0.0001, 0.001, 0.002, 0.004, 0.006, 0.008,
              0.01, 0.012, 0.014, 0.016, 0.018, 0.02],
    'max_iter': [100000],
}

# 5-fold search over the grid, keeping the per-fold training scores as well.
grid_search = GridSearchCV(Lasso(), alpha_grid, cv=5, return_train_score=True)

# fit() returns the search object itself, so best_model is the fitted GridSearchCV.
best_model = grid_search.fit(xTrain, yTrain)
print("Best alpha: ", best_model.best_params_['alpha'])

## 6. Fit model on whole training set using the cross-validated parameters

In [None]:
# Fit Lasso on the whole training set using the hyper-parameters selected by
# the grid search. Previously this cell just re-ran fit() on the alpha=0.001
# baseline, so the cross-validated alpha was never applied to the final model.
# Building a fresh estimator also makes the cell safe to re-run.
lasso = Lasso(**grid_search.best_params_)
lasso.fit(xTrain, yTrain)

## 7. Evaluate model by applying it to test set

In [None]:
# Evaluate the fitted Lasso on the held-out test split.
# Fixes: the original called `tree.score`, but no `tree` estimator is defined
# in the visible cells (NameError); the predictions were computed and then
# discarded; and the score was labelled "Accuracy" although a regressor's
# score() is the R^2 coefficient of determination.
test_predictions = lasso.predict(xTest)
print("R^2 on test set: {:.3f}".format(lasso.score(xTest, yTest)))
print("MSE on test set: {:.3f}".format(mse(yTest, test_predictions)))

## 8. Repeat 4-7 for other methods

In [None]:
# Ridge regression
# Uses the split variables created by train_test_split above (xTrain, xTest,
# yTrain, yTest). The original referenced X_train / y_train / X_test / y_test,
# which are not defined in the visible cells and raise NameError.
ridge = Ridge().fit(xTrain, yTrain)
print("Training set score: {:.2f}".format(ridge.score(xTrain, yTrain)))
print("Test set score: {:.2f}".format(ridge.score(xTest, yTest)))

# Fitted coefficients, one per feature column.
print(ridge.coef_)

# TODO: include the Bible-names variable as a feature


## 9. Apply the chosen method to new observations for which we have no labels