In [135]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier


In [7]:
# Load the prepared feature table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer="working_data/prepared_data.csv")
df.head(n=5)

Unnamed: 0,location,set,a_b,location_type,classification,binary_temp,binary_moisture,Rounded to Sub-Rounded,Sub Rounded to Sub Angular,Sub Angular to Angular,...,Deep Troughs,Crescentic Gouges,Arc Shaped Steps,Linear Steps,Sharp Angular Features,Upturned Plates,V Shaped,Edge Rounding,Breakage Blocks,Abrasion Features
0,ELVA,5,B,river,cold-wet,cold,wet,0,1,0,...,0,0,1,0,0,0,1,1,0,0
1,ELVA,5,B,river,cold-wet,cold,wet,0,1,0,...,0,1,0,1,0,0,0,1,0,0
2,ELVA,5,B,river,cold-wet,cold,wet,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,ELVA,5,B,river,cold-wet,cold,wet,0,0,1,...,0,0,1,1,1,1,0,0,0,0
4,ELVA,5,B,river,cold-wet,cold,wet,0,0,1,...,0,0,1,1,0,0,1,0,0,0


### Temperature binary modeling

In [141]:
# Keep only the river samples; they are the basis for the binary temperature models
df_binary_temp = df[df["location_type"].eq("river")]

In [142]:
# Number of river samples retained by the filter
df_binary_temp.shape[0]

1143

In [143]:
# Remove the identifier/metadata columns and the moisture label.
# `binary_temp` is kept for now because the numeric outcome is derived from it.
df_binary_temp = df_binary_temp.drop(
    labels=[
        "location",
        "set",
        "a_b",
        "location_type",
        "classification",
        "binary_moisture",
    ],
    axis="columns",
)

In [144]:
# Encode the temperature label as a numeric outcome:
#   1 = cold
#   0 = hot (any value other than "cold", including a missing value, also maps to 0)
# The label is read from this frame's own `binary_temp` column instead of the
# unfiltered `df`, so the result does not depend on index alignment with the parent
# frame, and the comparison is vectorized rather than applied row by row.
df_binary_temp["outcome"] = (df_binary_temp["binary_temp"] == "cold").astype("int64")

In [145]:
# Capture the frame's current column labels and display them
columns = df_binary_temp.columns
columns

Index(['binary_temp', 'Rounded to Sub-Rounded', 'Sub Rounded to Sub Angular',
       'Sub Angular to Angular', 'Low Relief', 'Medium Relief', 'High Relief',
       'Precipitation Features', 'Dissolution Etching', 'Fracture Faces',
       'Subparallel Linear Features', 'Conchoidal Fractures', 'Curved Grooves',
       'Straight Grooves', 'Deep Troughs', 'Crescentic Gouges',
       'Arc Shaped Steps', 'Linear Steps', 'Sharp Angular Features',
       'Upturned Plates', 'V Shaped', 'Edge Rounding', 'Breakage Blocks',
       'Abrasion Features', 'outcome'],
      dtype='object')

In [146]:
# Desired column order: the text label and numeric outcome first,
# followed by the 23 feature columns in their existing order.
new_order = ['binary_temp', 'outcome'] + [
    'Rounded to Sub-Rounded',
    'Sub Rounded to Sub Angular',
    'Sub Angular to Angular',
    'Low Relief',
    'Medium Relief',
    'High Relief',
    'Precipitation Features',
    'Dissolution Etching',
    'Fracture Faces',
    'Subparallel Linear Features',
    'Conchoidal Fractures',
    'Curved Grooves',
    'Straight Grooves',
    'Deep Troughs',
    'Crescentic Gouges',
    'Arc Shaped Steps',
    'Linear Steps',
    'Sharp Angular Features',
    'Upturned Plates',
    'V Shaped',
    'Edge Rounding',
    'Breakage Blocks',
    'Abrasion Features',
]

In [147]:
# Reorder the columns to match `new_order`.
# Plain label selection is used instead of `reindex` so that a misspelled or missing
# name in the hand-typed list raises a KeyError rather than silently producing an
# all-NaN column that would then flow into the models.
df_binary_temp = df_binary_temp[new_order]

In [148]:
# The text label is no longer needed once `outcome` holds its numeric encoding
df_binary_temp = df_binary_temp.drop(labels="binary_temp", axis="columns")

In [149]:
# Display the modeling frame: numeric outcome followed by the feature columns
df_binary_temp

Unnamed: 0,outcome,Rounded to Sub-Rounded,Sub Rounded to Sub Angular,Sub Angular to Angular,Low Relief,Medium Relief,High Relief,Precipitation Features,Dissolution Etching,Fracture Faces,...,Deep Troughs,Crescentic Gouges,Arc Shaped Steps,Linear Steps,Sharp Angular Features,Upturned Plates,V Shaped,Edge Rounding,Breakage Blocks,Abrasion Features
0,1,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,1,1,0,0
1,1,0,1,0,0,1,0,1,0,1,...,0,1,0,1,0,0,0,1,0,0
2,1,0,0,1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
3,1,0,0,1,0,1,0,0,1,1,...,0,0,1,1,1,1,0,0,0,0
4,1,0,0,1,0,1,0,0,1,0,...,0,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1236,0,0,0,1,0,1,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
1237,0,0,1,0,0,1,0,1,1,0,...,0,0,1,1,0,0,0,0,0,0
1238,0,0,0,1,0,1,0,1,1,0,...,0,0,1,0,0,1,0,1,0,0
1239,0,0,1,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0


In [150]:
# Separate the feature matrix (X) from the target vector (y)
X = df_binary_temp.drop(labels="outcome", axis="columns")
y = df_binary_temp.loc[:, "outcome"]

In [151]:
# Hold out a quarter of the rows for testing (scikit-learn's default test size),
# stratified on y so both splits keep the same class balance; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)

In [152]:
# Shape of the training feature matrix as (rows, feature columns)
X_train.shape

(857, 23)

In [153]:
# Shape of the testing feature matrix as (rows, feature columns)
X_test.shape

(286, 23)

In [154]:
# Configure the logistic regression classifier (L-BFGS solver, at most 200 iterations)
classifier = LogisticRegression(
    solver="lbfgs",
    max_iter=200,
    random_state=1,
)

In [155]:
# Train the logistic regression on the training split
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=200, random_state=1)

In [156]:
# Predict the outcome for every row of the held-out test split
y_pred = classifier.predict(X=X_test)

In [157]:
# Fraction of test samples the logistic regression classified correctly
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.7307692307692307


In [158]:
# Confusion matrix of true test labels against the logistic regression predictions
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[ 99  38]
 [ 39 110]]


In [159]:
# Per-class precision, recall, F1 and support on the test split
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.72      0.72       137
           1       0.74      0.74      0.74       149

    accuracy                           0.73       286
   macro avg       0.73      0.73      0.73       286
weighted avg       0.73      0.73      0.73       286



In [160]:
# Second model on the same split: support vector machine with a linear kernel
model = SVC(kernel="linear")

In [161]:
# Train the SVM on the training split
model.fit(X=X_train, y=y_train)

SVC(kernel='linear')

In [162]:
# Predict the outcome for the test split with the fitted SVM
y_pred = model.predict(X=X_test)

In [163]:
# Accuracy of the SVM on the test split
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7202797202797203

In [164]:
# Confusion matrix of true test labels against the SVM predictions
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[100,  37],
       [ 43, 106]])

In [165]:
# Per-class precision, recall, F1 and support for the SVM
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.70      0.73      0.71       137
           1       0.74      0.71      0.73       149

    accuracy                           0.72       286
   macro avg       0.72      0.72      0.72       286
weighted avg       0.72      0.72      0.72       286



In [166]:
# Third model on the same split: decision tree.
# random_state is fixed so the fitted tree, and the accuracy computed from it, is
# reproducible from run to run, in line with the other seeded steps in this notebook
# (train_test_split, LogisticRegression, RandomForestClassifier).
model = tree.DecisionTreeClassifier(random_state=1)
# Fit the tree on the training split; the fitted estimator is bound back to `model`.
model = model.fit(X_train, y_train)

In [167]:
# Predict the outcome for the test split with the fitted decision tree
predictions = model.predict(X=X_test)

In [168]:
# Accuracy of the decision tree on the test split
accuracy_score(y_true=y_test, y_pred=predictions)

0.6013986013986014

In [169]:
# Fourth model: random forest of 128 trees with a fixed seed
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [170]:
# Train the random forest on the training split
rf_model = rf_model.fit(X=X_train, y=y_train)

In [171]:
# Predict the outcome for the test split with the fitted random forest
predictions = rf_model.predict(X=X_test)

In [172]:
# Accuracy of the random forest on the test split
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.6573426573426573


### Moisture binary modeling

In [111]:
# Keep only the river samples; they are the basis for the binary moisture models
df_binary_moisture = df[df["location_type"].eq("river")]

In [112]:
# Remove the identifier/metadata columns and the temperature label.
# `binary_moisture` is kept for now because the numeric outcome is derived from it.
df_binary_moisture = df_binary_moisture.drop(
    labels=[
        "location",
        "set",
        "a_b",
        "location_type",
        "classification",
        "binary_temp",
    ],
    axis="columns",
)

In [113]:
# Encode the moisture label as a numeric outcome:
#   1 = wet
#   0 = dry (any value other than "wet", including a missing value, also maps to 0)
# The label is read from this frame's own `binary_moisture` column instead of the
# unfiltered `df`, so the result does not depend on index alignment with the parent
# frame, and the comparison is vectorized rather than applied row by row.
df_binary_moisture["outcome"] = (df_binary_moisture["binary_moisture"] == "wet").astype("int64")

In [116]:
# Display the frame to check the text moisture label against its numeric `outcome`
df_binary_moisture

Unnamed: 0,binary_moisture,Rounded to Sub-Rounded,Sub Rounded to Sub Angular,Sub Angular to Angular,Low Relief,Medium Relief,High Relief,Precipitation Features,Dissolution Etching,Fracture Faces,...,Crescentic Gouges,Arc Shaped Steps,Linear Steps,Sharp Angular Features,Upturned Plates,V Shaped,Edge Rounding,Breakage Blocks,Abrasion Features,outcome
0,wet,0,1,0,0,1,0,0,1,0,...,0,1,0,0,0,1,1,0,0,1
1,wet,0,1,0,0,1,0,1,0,1,...,1,0,1,0,0,0,1,0,0,1
2,wet,0,0,1,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
3,wet,0,0,1,0,1,0,0,1,1,...,0,1,1,1,1,0,0,0,0,1
4,wet,0,0,1,0,1,0,0,1,0,...,0,1,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1236,wet,0,0,1,0,1,0,0,1,0,...,0,1,0,1,0,0,0,0,0,1
1237,wet,0,1,0,0,1,0,1,1,0,...,0,1,1,0,0,0,0,0,0,1
1238,wet,0,0,1,0,1,0,1,1,0,...,0,1,0,0,1,0,1,0,0,1
1239,wet,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1


In [117]:
# The text label is no longer needed once `outcome` holds its numeric encoding
df_binary_moisture = df_binary_moisture.drop(labels="binary_moisture", axis="columns")

In [122]:
# Separate the feature matrix (X) from the target vector (y)
X = df_binary_moisture.drop(labels="outcome", axis="columns")
y = df_binary_moisture.loc[:, "outcome"]

In [123]:
# Hold out a quarter of the rows for testing (scikit-learn's default test size),
# stratified on y so both splits keep the same class balance; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)

In [124]:
# Configure the logistic regression classifier (L-BFGS solver, at most 200 iterations)
classifier = LogisticRegression(
    solver="lbfgs",
    max_iter=200,
    random_state=1,
)

In [125]:
# Train the logistic regression on the training split
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=200, random_state=1)

In [126]:
# Predict the outcome for every row of the held-out test split
y_pred = classifier.predict(X=X_test)

In [127]:
# Fraction of test samples the logistic regression classified correctly
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.6293706293706294


In [128]:
# Second model on the same split: support vector machine with a linear kernel
model = SVC(kernel="linear")

In [129]:
# Train the SVM on the training split
model.fit(X=X_train, y=y_train)

SVC(kernel='linear')

In [130]:
# Predict the outcome for the test split with the fitted SVM
y_pred = model.predict(X=X_test)

In [131]:
# Accuracy of the SVM on the test split
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6223776223776224

In [132]:
# Third model on the same split: decision tree.
# random_state is fixed so the fitted tree, and the accuracy computed from it, is
# reproducible from run to run, in line with the other seeded steps in this notebook
# (train_test_split, LogisticRegression, RandomForestClassifier).
model = tree.DecisionTreeClassifier(random_state=1)
# Fit the tree on the training split; the fitted estimator is bound back to `model`.
model = model.fit(X_train, y_train)

In [133]:
# Predict the outcome for the test split with the fitted decision tree
predictions = model.predict(X=X_test)

In [134]:
# Accuracy of the decision tree on the test split
accuracy_score(y_true=y_test, y_pred=predictions)

0.6013986013986014

In [136]:
# Fourth model: random forest of 128 trees with a fixed seed
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [137]:
# Train the random forest on the training split
rf_model = rf_model.fit(X=X_train, y=y_train)

In [138]:
# Predict the outcome for the test split with the fitted random forest
predictions = rf_model.predict(X=X_test)

In [140]:
# Accuracy of the random forest on the test split
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.6293706293706294
