## Dimensionality Reduction

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and every result derived from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    merged1[features_to_include],
    merged1['mobile success'],
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.decomposition import PCA

# Standardise the training features; the scaler is fitted on X_train only
scaler_PCA = StandardScaler().fit(X_train)
Xscaled = scaler_PCA.transform(X_train)

# Fit a 20-component PCA on the scaled data and project the data onto it
pca = PCA(n_components=20).fit(Xscaled)
X_reduced = pca.transform(Xscaled)

In [None]:
# Total share of the variance captured by the fitted components
pca.explained_variance_ratio_.sum()

In [None]:
# Refit PCA with only two components so the data can be drawn on a plane
pca = PCA(n_components=2).fit(Xscaled)

# Project the scaled training data onto those two components
X_reduced = pca.transform(Xscaled)

In [None]:
# Wrap the 2-D projection in a DataFrame that keeps the training row index
X_reduceddf = pd.DataFrame(
    data=X_reduced,
    columns=['PC1', 'PC2'],
    index=X_train.index,
)

In [None]:
# Project the k-means cluster centres into the current PCA space.
# NOTE(review): `kmeans` is fitted in a later cell, on data scaled by a
# different scaler after a fresh train/test split — confirm that the centres
# and `pca` really share the same feature space.
centres_reduced = pca.transform(kmeans.cluster_centers_)

In [None]:
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=1, illustrative_var=None):
    '''Display a scatter plot of the projected points, one figure per factorial plane.

    Parameters
    ----------
    X_projected : array-like
        Projected data, indexed as [sample, component].
    n_comp : int
        Number of available components; a plane whose second axis index is
        not below this value is skipped.
    pca : object
        Fitted model exposing ``explained_variance_ratio_`` (used for the
        axis labels only).
    axis_ranks : iterable of (int, int)
        Zero-based component index pairs ``(d1, d2)``, one per plane to draw.
    labels : sequence, optional
        Text drawn at each point, in the same order as the rows of
        ``X_projected``.
    alpha : float, optional
        Opacity of the scatter markers.
    illustrative_var : array-like, optional
        One value per point; points are drawn as one series per distinct
        value and a legend is added.

    Returns
    -------
    None
        The figures are created through ``plt`` and left open for the caller
        to extend or show.
    '''
    X_projected = np.asarray(X_projected)
    if illustrative_var is not None:
        # Convert once, not once per plane.
        illustrative_var = np.asarray(illustrative_var)

    # For each factorial plane
    for d1, d2 in axis_ranks:
        if d2 >= n_comp:
            continue

        # Initialise the matplotlib figure
        plt.figure(figsize=(7, 6))

        # Display the points
        if illustrative_var is None:
            plt.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha)
        else:
            for value in np.unique(illustrative_var):
                # A boolean mask selects rows directly and keeps the result 1-D.
                mask = illustrative_var == value
                plt.scatter(X_projected[mask, d1], X_projected[mask, d2], alpha=alpha, label=value)
            plt.legend()

        # Display the labels on the points. zip pairs them by position, so a
        # pandas Series with a non-default index is labelled correctly too.
        if labels is not None:
            for (x, y), label in zip(X_projected[:, [d1, d2]], labels):
                plt.text(x, y, label, fontsize=14, ha='center', va='center')

        # Define the limits of the chart: symmetric around the origin, 10% margin
        boundary = np.max(np.abs(X_projected[:, [d1, d2]])) * 1.1
        plt.xlim([-boundary, boundary])
        plt.ylim([-boundary, boundary])

        # Display the axes through the origin; axhline/axvline always span
        # the whole plot, whatever the data range.
        plt.axhline(0, color='grey', ls='--')
        plt.axvline(0, color='grey', ls='--')

        # Label the axes, with the percentage of variance explained
        plt.xlabel('PC{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
        plt.ylabel('PC{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

        plt.title("Projection of points (on PC{} and PC{})".format(d1+1, d2+1))

In [None]:
# Draw the training points on the PC1/PC2 plane ...
display_factorial_planes(X_projected=X_reduced, n_comp=3, pca=pca, axis_ranks=[(0, 1)], alpha=0.8)
# ... and overlay the projected cluster centres as red crosses on top
plt.scatter(
    x=centres_reduced[:, 0],
    y=centres_reduced[:, 1],
    marker='x',
    s=169,
    linewidths=3,
    color='r',
    zorder=10,
)

## K-means clustering (2 clusters)


In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and the clustering fitted on it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    merged1[features_to_include],
    merged1['mobile success'],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Standardise the training features; the scaler is fitted on X_train only
scaler_kmeans = StandardScaler().fit(X_train)
Xscaled = scaler_kmeans.transform(X_train)

In [None]:
# Fit a two-cluster k-means model on the scaled training data
kmeans = KMeans(n_clusters=2).fit(Xscaled)

In [None]:
# Inspect the cluster id assigned to each training row during fitting
kmeans.labels_

In [None]:
# Assign every scaled training row to a cluster with the fitted model
y_predict = kmeans.predict(Xscaled)

In [None]:
# Share of training rows whose cluster id equals the target value.
# NOTE(review): cluster ids are arbitrary, so this figure may be the
# complement of the real agreement — confirm how ids map onto the target.
accuracy_score(y_true=y_train, y_pred=kmeans.labels_)

In [None]:
# Same comparison using the explicit predictions for the training rows.
# NOTE(review): cluster ids are arbitrary — confirm they line up with the
# target's encoding before reading this as an accuracy.
accuracy_score(y_train,y_predict)

## Pipeline

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and the pipeline score computed from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    merged1[features_to_include],
    merged1['multi_absolutemobilitypercentile'],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Pipeline stages, in execution order: standardise, then classify
steps = [
    ('scaler', StandardScaler()),
    ('decisiontree', tree.DecisionTreeClassifier()),
]

In [None]:
# Chain the stages into a single estimator
pipe = Pipeline(steps=steps)

In [None]:
# Fit the scaler and the decision tree on the training split
pipe.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split
pipe.score(X_test,y_test)

## Confusion Matrix

In [None]:
# Self-contained demo on scikit-learn's 3-class wine toy dataset.
# NOTE: this cell rebinds X_train / X_test / y_train / y_test.
from sklearn.datasets import load_wine
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

dataset = load_wine()
X, y = dataset.data, dataset.target

# Seeded split, then an RBF-kernel support vector classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(kernel='rbf', C=1)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Confusion matrix of true vs predicted classes
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(confusion)

print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

# Precision / recall / F1 under each averaging scheme, printed in table order
for template, scorer, avg in (
    ('Micro Precision: {:.2f}', precision_score, 'micro'),
    ('Micro Recall: {:.2f}', recall_score, 'micro'),
    ('Micro F1-score: {:.2f}\n', f1_score, 'micro'),
    ('Macro Precision: {:.2f}', precision_score, 'macro'),
    ('Macro Recall: {:.2f}', recall_score, 'macro'),
    ('Macro F1-score: {:.2f}\n', f1_score, 'macro'),
    ('Weighted Precision: {:.2f}', precision_score, 'weighted'),
    ('Weighted Recall: {:.2f}', recall_score, 'weighted'),
    ('Weighted F1-score: {:.2f}', f1_score, 'weighted'),
):
    print(template.format(scorer(y_test, y_pred, average=avg)))

# Per-class summary
print('\nClassification Report\n')
print(classification_report(y_test, y_pred, target_names=['Class 1', 'Class 2', 'Class 3']))


## LIME

In [None]:
import lime
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lime.lime_tabular import LimeTabularExplainer
from sklearn.pipeline import make_pipeline

In [None]:
# Training parameters passed to lgb.train.
# NOTE(review): the objective is 'binary', yet the LIME explainer further down
# names four classes — confirm whether a multiclass objective is intended.
lgb_params = {
    'task': 'train',
    'boosting_type': 'goss',
    'objective': 'binary',
    # 'metric' used to be listed twice; a dict literal keeps only the last
    # value, so the effective setting is stated once here.
    'metric': {'l2', 'auc'},
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbose': None,
    'num_iteration': 100,
    'num_threads': 7,
    'max_depth': 12,
    'min_data_in_leaf': 100,
    'alpha': 0.5,
}

In [None]:
# Encoder used below to turn the target column into integer class codes
le = LabelEncoder()

In [None]:
# Work on a copy of the data without the 'cz name' and 'state' columns
df_lime = merged1.drop(['cz name', 'state'], axis='columns')
# Replace the target column with its label-encoded integer codes
df_lime['multi_absolutemobilitypercentile'] = le.fit_transform(
    df_lime['multi_absolutemobilitypercentile']
)

In [None]:
# Hold out 20% of the rows for validation. The seed is fixed so that the
# split, and the model trained on it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_lime[features_to_include],
    df_lime['multi_absolutemobilitypercentile'],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Wrap both splits as LightGBM datasets. The validation set references the
# training set, as LightGBM recommends for validation data.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [None]:
# Train with early stopping on the validation set. Early stopping is set up
# through the callback because the `early_stopping_rounds` keyword of
# lgb.train was removed in LightGBM 4.
model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=20,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)


In [None]:
# Check what kind of object lgb.train returned
type(model)

In [None]:
# Per-feature importance scores of the trained model
model.feature_importance()

In [None]:
# NumPy version of the training features, passed to the LIME explainer below
X_trainarray = X_train.to_numpy()

In [None]:
# LIME expects plain sequences: feature names as a list of strings and
# categorical features as a list of integer column indices, not DataFrames.
# NOTE(review): no feature is declared categorical here; pass
# `categorical_features=[...]` with the column indices if any features are
# genuinely categorical.
explainer = LimeTabularExplainer(
    X_trainarray,
    feature_names=list(X_train.columns),
    class_names=['0-25', '25-50', '50-75', '75-100'],
    mode='classification',
)

In [None]:
#X_test = np.array(X_test).reshape(-1, 1)

In [None]:
#exp = explainer.explain_instance(X_test[1], model.predict)


In [None]:
#exp.as_pyplot_figure()

## Folium

In [None]:
# One [lat, lon, weight] triple per row. Latitude comes first, which is the
# order folium's HeatMap expects. Built column-wise, not with a row-by-row
# apply.
merged1['combined'] = merged1[['lat', 'lon', 'am, 80-82 cohort']].values.tolist()
heatmaplist = merged1['combined'].tolist()
heatmaplist

In [None]:
# One [lat, lon, weight] list per row, stored back on the frame
heat_columns = ['lat', 'lon', 'am, 80-82 cohort']
merged1['combined'] = merged1[heat_columns].to_numpy().tolist()
merged1['combined']

In [None]:
# Weighted heat map layer with a blue -> lime -> red gradient, added to the map
hme = HeatMap(
    data=merged1['combined'],
    gradient={.2: 'blue', .65: 'lime', 1: 'red'},
).add_to(m)
m

In [None]:
from geopy.geocoders import Nominatim
import folium 
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime


In [None]:
# Preview the 'cz name' column, whose values are geocoded in the next cell
merged1['cz name']

In [None]:
# Geocode every 'cz name' into numeric coordinates.
# A single client is reused for all lookups.
geolocator = Nominatim(user_agent="Your_Name")

citycoords = []
citycoordslon = []
citycoordslat = []
for cz_name in merged1['cz name']:
    loc = geolocator.geocode(cz_name)
    if loc is None:
        # No match: store NaN so the lists stay aligned with merged1's rows
        # and one failed lookup does not abort the whole run.
        lat = lon = float('nan')
    else:
        raw = loc.raw
        # The raw payload carries coordinates as strings; store them as floats.
        lat, lon = float(raw['lat']), float(raw['lon'])
    citycoords.append((lat, lon))
    citycoordslat.append(lat)
    citycoordslon.append(lon)

In [None]:
# Map centre: the "centre of America" point near Lebanon, Kansas
centerofamericacoords = (39.8097, -98.5556)
# NOTE(review): confirm the installed folium version still ships the
# 'stamentoner' tile set.
m = folium.Map(
    location=centerofamericacoords,
    tiles='stamentoner',
    zoom_start=4,
    control_scale=True,
)

In [None]:
# Drop a small circle marker on the map for each row; the popup shows 'cz name'
for coords, cz_label in zip(merged1['citycoords'], merged1['cz name']):
    marker = folium.CircleMarker(
        coords,
        radius=1,
        popup=cz_label,
        fill_color="#3db7e4",
    )
    marker.add_to(m)

m

In [None]:
# Unweighted heat map of the geocoded coordinates, layered onto the same map
hm = HeatMap(data=merged1['citycoords']).add_to(m)
m

## SHAP feature importance

In [None]:
import shap
# Load SHAP's JavaScript so its interactive plots can render in the notebook
shap.initjs()

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(merged1[features_to_include], merged1['multi_absolutemobilitypercentile'], test_size=0.2)


In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and every result derived from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    merged1[features_to_include],
    merged1['mobile success'],
    test_size=0.2,
    random_state=0,
)


In [None]:
#f = lambda x: binarylogregCV.predict_proba(x)[:,1]
#med = X_train.median().values.reshape((1,X_train.shape[1]))
#explainer = shap.KernelExplainer(f, med)
#shap_values_single = explainer.shap_values(merged1[features_to_include].iloc[0,:], nsamples=1000)
#shap.force_plot(explainer.expected_value, shap_values_single, merged1[features_to_include].iloc[0,:])

In [None]:
#scaler_reg = StandardScaler().fit(X_train)
#X_train = scaler_reg.transform(X_train)
#X_test = scaler_reg.transform(X_test)

In [None]:
#explainer = shap.LinearExplainer(binarylogregCV, X_train,feature_dependence="independent")

In [None]:
#shap_values = explainer.shap_values(X_test)

In [None]:
#shap.summary_plot(shap_values, X_test)

## Hyperparameter tuning stuff

In [None]:
# Grid search over elastic-net logistic regression.
# The grid uses LogisticRegression's own parameter names (`C`, `l1_ratio`),
# and each array is passed directly so every value is a separate candidate.
C = np.logspace(0, 10, num=20)
l1 = np.arange(0, 1, .1)
penalty = ['elasticnet']
# 'saga' is the only solver that supports the elastic-net penalty.
solver = ['saga']
hyperparameters = dict(C=C, penalty=penalty, solver=solver, l1_ratio=l1)
logistic = LogisticRegression()
gridsearch = GridSearchCV(logistic, hyperparameters, cv=3)
gs = gridsearch.fit(X_train, y_train)


In [None]:
# Report the winning configuration of the grid search
for template, value in (
    ("Best Estimator: \n{}\n", gs.best_estimator_),
    ("Best Parameters: \n{}\n", gs.best_params_),
    ("Best Score: \n{}\n", gs.best_score_),
):
    print(template.format(value))