https://github.com/dbabichenko/python_for_data_and_analytics/tree/master/12%20-%20Classification

In [7]:
# Required Python Machine learning Packages
import pandas as pd
import numpy as np
import operator

# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

# To calculate the accuracy score of the model
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
def label_encode(df, columns):
    for col in columns:
        le = LabelEncoder()
        col_values_unique = list(df[col].unique())
        le_fitted = le.fit(col_values_unique)
 
        col_values = list(df[col].values)
        #le.classes_
        col_values_transformed = le.transform(col_values)
        df[col] = col_values_transformed

In [9]:
def classify(predictors, response, classifier = 'svm', kern='rbf', neighbors=3, kfolds=0, report=False, features_select=0):        
    # split X and y into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(predictors, response, test_size=0.20, random_state=1)

    
    if classifier == 'nb':
        cl = GaussianNB() # instantiate model
        msg = 'Naive Bayes'
    elif classifier == 'svm':
        cl = svm.SVC(kernel=kern) # instantiate model
        msg = 'SVM with ' + kern + ' kernel'
    elif classifier == 'knn':
        cl = KNeighborsRegressor(n_neighbors=neighbors)
        msg = 'KNN with k=' + str(neighbors)
    elif classifier == 'rf':
        # Instantiate model with  decision trees
        # Note that the number of decision trees is denoted
        # using the neighbors parameter, which is set to 3 by default
        cl = RandomForestRegressor(n_estimators = 1000, random_state = 42)
        msg = 'Random Forest with ' + str(neighbors) + ' decision trees'
    elif classifier == 'dtree':
        cl = DecisionTreeClassifier(min_samples_split=20, random_state=99)
        msg = 'Decision tree'
    elif classifier == 'logisticregression':
        cl = LogisticRegression()
        msg = 'Logistic Regression'
        
    model = cl.fit(X_train, y_train)
    y_pred_class = model.predict(X_test)
    print(msg + ' model accuracy score: ', metrics.accuracy_score(y_test, y_pred_class.round()))
    
    if kfolds > 0:
        # Perform k-fold cross validation
        scores = cross_val_score(model, predictors, response, cv=kfolds)
        print('Cross-validated score:', scores.mean())
    
    if report == True:
        print('Confusion matrix')
        print(metrics.confusion_matrix(y_test, y_pred_class)) # confusion matrix
        print('Classification report')
        target_names=['low','med','high']
        print(metrics.classification_report(y_test, y_pred_class, target_names=target_names))
        
        print('______________________________')
    
    if features_select > 0:
        test = SelectKBest(score_func=chi2, k=features_select)
        fit = test.fit(predictors, response)
        
        # summarize scores
        np.set_printoptions(precision=3)
        # print(fit.scores_)
        features_scores = {}
        for i in range(0, len(predictors.columns)):
            features_scores[predictors.columns[i]] = fit.scores_[i]
            
        # print(features_scores)
        sorted_features_scores = sorted(features_scores.items(), key=operator.itemgetter(1))
        print(sorted_features_scores[-5:])        

In [10]:
def decompose(predictors):
    # feature extraction
    pca = PCA(n_components=3)
    fit = pca.fit(predictors)
    
    # summarize components
    # print(fit.explained_variance_ratio_)
    # print(fit.components_)
    return pd.DataFrame(fit.components_)

In [11]:
df = pd.read_csv("311Trimmed.csv")
df.head()

Unnamed: 0,REQUEST_ID,CREATED_ON,REQUEST_TYPE,REQUEST_ORIGIN,STATUS,DEPARTMENT,NEIGHBORHOOD,COUNCIL_DISTRICT,WARD,TRACT,...,daily.temperatureMin,daily.temperatureMinTime,daily.time,daily.uvIndex,daily.uvIndexTime,daily.visibility,daily.windBearing,daily.windGust,daily.windGustTime,daily.windSpeed
0,205290,2017-12-31T23:17:00,Snow/Ice removal,Website,1,DPW - Street Maintenance,Marshall-Shadeland,1.0,27.0,42003271500,...,1.99,1514721600,1514696400,1,1514732400,9.09,264,5.32,1514696400,0.57
1,205289,2017-12-31T22:01:00,Snow/Ice removal,Report2Gov iOS,1,DPW - Street Maintenance,Stanton Heights,9.0,10.0,42003100500,...,1.57,1514721600,1514696400,1,1514732400,9.06,265,5.27,1514696400,0.51
2,205288,2017-12-31T21:55:00,Snow/Ice removal,Website,1,DPW - Street Maintenance,New Homestead,5.0,31.0,42003310300,...,1.76,1514721600,1514696400,1,1514732400,9.06,257,5.38,1514696400,0.63
3,205287,2017-12-31T18:23:00,Missed Pick Up,Website,1,DPW - Refuse,Mount Washington,2.0,19.0,42003191400,...,1.5,1514721600,1514696400,1,1514732400,9.09,262,5.35,1514696400,0.55
4,205286,2017-12-31T17:56:00,Snow/Ice removal,Report2Gov iOS,1,DPW - Street Maintenance,Stanton Heights,9.0,10.0,42003100500,...,1.57,1514721600,1514696400,1,1514732400,9.06,265,5.27,1514696400,0.52


In [14]:
df.fillna(0, inplace=True) #fill zero instances of weather
df = df.reset_index()

In [12]:
df.columns

Index(['REQUEST_ID', 'CREATED_ON', 'REQUEST_TYPE', 'REQUEST_ORIGIN', 'STATUS',
       'DEPARTMENT', 'NEIGHBORHOOD', 'COUNCIL_DISTRICT', 'WARD', 'TRACT',
       'PUBLIC_WORKS_DIVISION', 'PLI_DIVISION', 'POLICE_ZONE', 'FIRE_ZONE',
       'X', 'Y', 'GEO_ACCURACY', 'current.apparentTemperature',
       'current.cloudCover', 'current.dewPoint', 'current.humidity',
       'current.icon', 'current.precipIntensity', 'current.precipProbability',
       'current.precipType', 'current.pressure', 'current.summary',
       'current.temperature', 'current.time', 'current.uvIndex',
       'current.visibility', 'current.windBearing', 'current.windGust',
       'current.windSpeed', 'daily.apparentTemperatureHigh',
       'daily.apparentTemperatureHighTime', 'daily.apparentTemperatureLow',
       'daily.apparentTemperatureLowTime', 'daily.apparentTemperatureMax',
       'daily.apparentTemperatureMaxTime', 'daily.apparentTemperatureMin',
       'daily.apparentTemperatureMinTime', 'daily.cloudCover',
    

In [15]:
# define X and y
X = df[['current.apparentTemperature', 'current.cloudCover', 'current.dewPoint', 'current.humidity',
       'current.icon', 'current.precipIntensity', 'current.precipProbability',
       'current.precipType', 'current.pressure', 'current.summary',
       'current.temperature', 'current.uvIndex',
       'current.visibility', 'current.windBearing', 'current.windGust',
       'current.windSpeed', 'daily.apparentTemperatureHigh',
       'daily.apparentTemperatureLow', 'daily.apparentTemperatureMax',
       'daily.apparentTemperatureMin', 'daily.cloudCover',
       'daily.dewPoint', 'daily.humidity', 'daily.icon', 'daily.moonPhase',
       'daily.precipAccumulation', 'daily.precipIntensity', 'daily.precipIntensityMax', 
       'daily.precipProbability', 'daily.precipType', 'daily.pressure',
       'daily.summary', 'daily.temperatureHigh', 'daily.temperatureLow', 
       'daily.temperatureMax', 'daily.temperatureMin', 'daily.uvIndex', 'daily.visibility',
       'daily.windBearing', 'daily.windGust', 'daily.windSpeed']]
label_encode(X, X.columns.values)

# y = df[['NEIGHBORHOOD']]
# y = df['DEPARTMENT']
y = df[['REQUEST_TYPE']]  

label_encode(y, y.columns.values)
# y = y['REQUEST_TYPE']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [None]:
#classify(X, y, classifier='nb', kfolds=10, report=False, features_select=3)
# classify(X, y, classifier='svm', kern='rbf', kfolds=10, report=False, features_select=3)
#X_pca = decompose(X)
#classify(X_pca, y, classifier='svm', kern='rbf', kfolds=10, report=False, features_select=3)
#print(X_pca.shape)
# classify(X, y, classifier='svm', kern='linear', kfolds=10, report=True)
# classify(X, y, classifier='svm', kern='poly', kfolds=10, report=True)
# classify(X, y, classifier='knn', neighbors=3, kfolds=10)
# classify(X, y, classifier='knn', neighbors=5, kfolds=10)
# classify(X, y, classifier='knn', neighbors=7)
# classify(X, y, classifier='rf', neighbors=1000, kfolds=10)
# classify(X, y, classifier='rf', neighbors=2000)
# classify(X, y, classifier='dtree', kfolds=10)
classify(X, y, classifier='logisticregression', kfolds=10)

  ensure_2d=ensure_2d, allow_nd=allow_nd,
  " 0.22. Specify the multi_class option to silence "


Logistic Regression model accuracy score:  0.25063704005707876


  ensure_2d=ensure_2d, allow_nd=allow_nd,
  " 0.22. Specify the multi_class option to silence "
  ensure_2d=ensure_2d, allow_nd=allow_nd,
  " 0.22. Specify the multi_class option to silence "
  ensure_2d=ensure_2d, allow_nd=allow_nd,
  " 0.22. Specify the multi_class option to silence "


In [None]:
classify(X, y, classifier='nb', kfolds=10, report=False, features_select=3)

In [None]:
classify(X, y, classifier='dtree', kfolds=10)

In [None]:
classify(X, y, classifier='rf', neighbors=2000)

In [None]:
classify(X, y, classifier='rf', neighbors=1000, kfolds=10)

In [None]:
classify(X, y, classifier='knn', neighbors=7)

In [None]:
classify(X, y, classifier='knn', neighbors=5, kfolds=10)

In [None]:
classify(X, y, classifier='knn', neighbors=3, kfolds=10)

In [None]:
# classify(X, y, classifier='svm', kern='poly', kfolds=10, report=True)
# classify(X, y, classifier='svm', kern='linear', kfolds=10, report=True)

In [None]:
classify(X, y, classifier='svm', kern='rbf', kfolds=10, report=False, features_select=3)

In [None]:
X_pca = decompose(X)
classify(X_pca, y, classifier='svm', kern='rbf', kfolds=10, report=False, features_select=3)
print(X_pca.shape)