In [265]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import itertools
import time

In [190]:
# Import data
df = pd.read_csv("working_data/prepared_data.csv")
df.head()

Unnamed: 0,location,set,a_b,location_type,classification,binary_temp,binary_moisture,Rounded to Sub-Rounded,Sub-Rounded to Sub-Angular,Sub-Angular to Angular,...,Deep Troughs,Crescentic Gouges,Arc Shaped Steps,Linear Steps,Sharp Angular Features,Upturned Plates,V Shaped Cracks,Edge Rounding,Breakage Blocks,Abrasion Features
0,NOR,5,B,river,cold-wet,cold,wet,0,1,0,...,0,0,1,0,0,0,1,1,0,0
1,NOR,5,B,river,cold-wet,cold,wet,0,1,0,...,0,1,0,1,0,0,0,1,0,0
2,NOR,5,B,river,cold-wet,cold,wet,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,NOR,5,B,river,cold-wet,cold,wet,0,0,1,...,0,0,1,1,1,1,0,0,0,0
4,NOR,5,B,river,cold-wet,cold,wet,0,0,1,...,0,0,1,1,0,0,1,0,0,0


In [249]:
len(df)

1241

### Temperature binary modeling

In [191]:
# Create dataframe for binary temperature logistic regression
df_binary_temp = df.loc[df['location_type'] == "river"]

In [192]:
len(df_binary_temp)

1143

In [193]:
df_binary_temp = df_binary_temp.drop(columns=["location", "set", "a_b", "location_type", "classification", "binary_moisture"])

In [194]:
# Convert outcome to numerical
# 1 is cold
# 0 is hot
df_binary_temp["outcome"] = df.binary_temp.apply(lambda x: 1 if x == "cold" else 0)

In [195]:
columns = df_binary_temp.columns
columns

Index(['binary_temp', 'Rounded to Sub-Rounded', 'Sub-Rounded to Sub-Angular',
       'Sub-Angular to Angular', 'Low Relief', 'Medium Relief', 'High Relief',
       'Precipitation Features', 'Dissolution Etching', 'Fracture Faces',
       'Subparallel Linear Features', 'Conchoidal Fractures', 'Curved Grooves',
       'Straight Grooves', 'Deep Troughs', 'Crescentic Gouges',
       'Arc Shaped Steps', 'Linear Steps', 'Sharp Angular Features',
       'Upturned Plates', 'V Shaped Cracks', 'Edge Rounding',
       'Breakage Blocks', 'Abrasion Features', 'outcome'],
      dtype='object')

In [196]:
new_order = ['binary_temp', 'outcome', 'Rounded to Sub-Rounded', 'Sub Rounded to Sub Angular',
       'Sub Angular to Angular', 'Low Relief', 'Medium Relief', 'High Relief',
       'Precipitation Features', 'Dissolution Etching', 'Fracture Faces',
       'Subparallel Linear Features', 'Conchoidal Fractures', 'Curved Grooves',
       'Straight Grooves', 'Deep Troughs', 'Crescentic Gouges',
       'Arc Shaped Steps', 'Linear Steps', 'Sharp Angular Features',
       'Upturned Plates', 'V Shaped Cracks', 'Edge Rounding', 'Breakage Blocks',
       'Abrasion Features']

In [197]:
df_binary_temp = df_binary_temp.reindex(columns=new_order)

In [198]:
df_binary_temp = df_binary_temp.drop(columns=["binary_temp"])

In [199]:
df_binary_temp = df_binary_temp.drop(columns=['Rounded to Sub-Rounded', 'Sub Rounded to Sub Angular',
       'Sub Angular to Angular', 'Low Relief', 'Medium Relief', 'High Relief',])

In [200]:
df_binary_temp

Unnamed: 0,outcome,Precipitation Features,Dissolution Etching,Fracture Faces,Subparallel Linear Features,Conchoidal Fractures,Curved Grooves,Straight Grooves,Deep Troughs,Crescentic Gouges,Arc Shaped Steps,Linear Steps,Sharp Angular Features,Upturned Plates,V Shaped Cracks,Edge Rounding,Breakage Blocks,Abrasion Features
0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0
1,1,1,0,1,1,1,0,0,0,1,0,1,0,0,0,1,0,0
2,1,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0
3,1,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0
4,1,0,1,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1236,0,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0
1237,0,1,1,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0
1238,0,1,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0
1239,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0


In [201]:
# Create dataframes for regression model
y = df_binary_temp["outcome"]
X = df_binary_temp.drop(columns="outcome")

In [202]:
# Split data into testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [203]:
# Evaluate shape of training data
X_train.shape

(857, 17)

In [204]:
# Evaluate shape of testing data
X_test.shape

(286, 17)

In [205]:
# Instantiate model
classifier = LogisticRegression(solver='lbfgs',max_iter=200,random_state=1)

In [206]:
# Run model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [207]:
# Create predictions based on model
y_pred = classifier.predict(X_test)

In [208]:
print(accuracy_score(y_test, y_pred))

0.6818181818181818


In [209]:
# create confusion matrix
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[ 87  50]
 [ 41 108]]


In [210]:
# print report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.64      0.66       137
           1       0.68      0.72      0.70       149

    accuracy                           0.68       286
   macro avg       0.68      0.68      0.68       286
weighted avg       0.68      0.68      0.68       286



In [260]:
# Run logistic regression model with combinations of independent variables

# get list of features
features = list(df_binary_temp.columns)
features.remove('outcome')
print(features)

combinations = []

# append possible combinations 
for i in range(0, len(features)+1):
    for subset in itertools.combinations(features, i):
        combinations.append(subset)

print(len(combinations))

['Precipitation Features', 'Dissolution Etching', 'Fracture Faces', 'Subparallel Linear Features', 'Conchoidal Fractures', 'Curved Grooves', 'Straight Grooves', 'Deep Troughs', 'Crescentic Gouges', 'Arc Shaped Steps', 'Linear Steps', 'Sharp Angular Features', 'Upturned Plates', 'V Shaped Cracks', 'Edge Rounding', 'Breakage Blocks', 'Abrasion Features']
131072


In [306]:
results_dict = {}

In [308]:
count = 0
results_dict = {}
t0 = time.time()

for c in range(1, len(combinations)):
    # Append "outcome" to selected subset to ensure column is in datafame
    comb = list(combinations[c])
    comb.append("outcome")
    # create dataframe with selected subset of features
    logistic_df = df_binary_temp[comb]
    
    # Create dataframes for regression model
    y = logistic_df["outcome"]
    X = logistic_df.drop(columns="outcome")
    
    # Split data into testing and training
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
    
    # Instantiate model
    classifier = LogisticRegression(solver='lbfgs',max_iter=200,random_state=1)
    
    # Run model
    classifier.fit(X_train, y_train)
    
    # Create predictions based on model
    y_pred = classifier.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    
    # Set list to tuple to be set as dictionary key (lists can't be keys)
    # accuracy is set to the value
    results_dict[tuple(comb)] = acc
    count += 1
    if count%1000 == 0:
        print(count)
        t = time.time()
        print(f'{t - t0} seconds')

t = time.time()
print(f'Finished in {t-t0} seconds')

1000
11.38988184928894 seconds
2000
23.08493971824646 seconds
3000
35.0455527305603 seconds
4000
47.283421993255615 seconds
5000
59.57030463218689 seconds
6000
71.5883059501648 seconds
7000
83.63515377044678 seconds
8000
95.76886081695557 seconds
9000
108.09730982780457 seconds
10000
120.7847056388855 seconds
11000
133.71019983291626 seconds
12000
147.17140197753906 seconds
13000
162.19702577590942 seconds
14000
176.02314591407776 seconds
15000
188.69489002227783 seconds
16000
201.1761028766632 seconds
17000
213.74863696098328 seconds
18000
226.30219864845276 seconds
19000
239.0059118270874 seconds
20000
251.4839849472046 seconds
21000
263.99079990386963 seconds
22000
276.99781584739685 seconds
23000
290.45021390914917 seconds
24000
303.8917419910431 seconds
25000
317.1794717311859 seconds
26000
330.4947657585144 seconds
27000
343.8748459815979 seconds
28000
357.0459747314453 seconds
29000
370.3395838737488 seconds
30000
383.5162880420685 seconds
31000
396.6742389202118 seconds
32000
4

IndexError: list index out of range

In [309]:
len(results_dict)

131071

In [310]:
max(results_dict, key = results_dict.get) 

('Precipitation Features',
 'Dissolution Etching',
 'Fracture Faces',
 'Subparallel Linear Features',
 'Crescentic Gouges',
 'Upturned Plates',
 'V Shaped Cracks',
 'Breakage Blocks',
 'outcome')

In [313]:
results_dict[('Precipitation Features',
 'Dissolution Etching',
 'Fracture Faces',
 'Subparallel Linear Features',
 'Crescentic Gouges',
 'Upturned Plates',
 'V Shaped Cracks',
 'Breakage Blocks',
 'outcome')]

0.7132867132867133

In [211]:
# Run same dataset with SVM: support vector machines
model = SVC(kernel='linear')

In [212]:
# Fit model
model.fit(X_train, y_train)

SVC(kernel='linear')

In [213]:
# Run model
y_pred = model.predict(X_test)

In [214]:
# Get accurary score
accuracy_score(y_test, y_pred)

0.6748251748251748

In [215]:
confusion_matrix(y_test, y_pred)

array([[ 89,  48],
       [ 45, 104]])

In [216]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.65      0.66       137
           1       0.68      0.70      0.69       149

    accuracy                           0.67       286
   macro avg       0.67      0.67      0.67       286
weighted avg       0.67      0.67      0.67       286



In [217]:
# Run same set with decision tree
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train, y_train)

In [218]:
# Making predictions using the testing data.
predictions = model.predict(X_test)

In [219]:
accuracy_score(y_test, predictions)

0.6013986013986014

In [220]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [221]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [222]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)

In [223]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.6608391608391608


### Moisture binary modeling

In [224]:
# Create dataframe for binary temperature logistic regression
df_binary_moisture = df.loc[df['location_type'] == "river"]

In [225]:
df_binary_moisture = df_binary_moisture.drop(columns=["location", "set", "a_b", "location_type", 
                                                      "classification", "binary_temp"])

In [226]:
# Convert outcome to numerical
# 1 is wet
# 0 is dry
df_binary_moisture["outcome"] = df.binary_moisture.apply(lambda x: 1 if x == "wet" else 0)

In [230]:
df_binary_moisture.columns

Index(['binary_moisture', 'Rounded to Sub-Rounded',
       'Sub-Rounded to Sub-Angular', 'Sub-Angular to Angular', 'Low Relief',
       'Medium Relief', 'High Relief', 'Precipitation Features',
       'Dissolution Etching', 'Fracture Faces', 'Subparallel Linear Features',
       'Conchoidal Fractures', 'Curved Grooves', 'Straight Grooves',
       'Deep Troughs', 'Crescentic Gouges', 'Arc Shaped Steps', 'Linear Steps',
       'Sharp Angular Features', 'Upturned Plates', 'V Shaped Cracks',
       'Edge Rounding', 'Breakage Blocks', 'Abrasion Features', 'outcome'],
      dtype='object')

In [231]:
df_binary_moisture = df_binary_moisture.drop(columns=['binary_moisture', 'Rounded to Sub-Rounded',
       'Sub-Rounded to Sub-Angular', 'Sub-Angular to Angular', 'Low Relief',
       'Medium Relief', 'High Relief'])

In [232]:
# Create dataframes for regression model
y = df_binary_moisture["outcome"]
X = df_binary_moisture.drop(columns="outcome")

In [233]:
# Split data into testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [234]:
# Instantiate model
classifier = LogisticRegression(solver='lbfgs',max_iter=200,random_state=1)

In [235]:
# Run model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [236]:
# Create predictions based on model
y_pred = classifier.predict(X_test)

In [237]:
print(accuracy_score(y_test, y_pred))

0.6433566433566433


In [314]:
# Run logistic regression model with combinations of independent variables

# get list of features
features = list(df_binary_moisture.columns)
features.remove('outcome')
print(features)

combinations = []

# append possible combinations 
for i in range(0, len(features)+1):
    for subset in itertools.combinations(features, i):
        combinations.append(subset)

print(len(combinations))

['Precipitation Features', 'Dissolution Etching', 'Fracture Faces', 'Subparallel Linear Features', 'Conchoidal Fractures', 'Curved Grooves', 'Straight Grooves', 'Deep Troughs', 'Crescentic Gouges', 'Arc Shaped Steps', 'Linear Steps', 'Sharp Angular Features', 'Upturned Plates', 'V Shaped Cracks', 'Edge Rounding', 'Breakage Blocks', 'Abrasion Features']
131072


In [317]:
count = 0
results_moisture = {}
t0 = time.time()

for c in range(1, len(combinations)):
    # Append "outcome" to selected subset to ensure column is in datafame
    comb = list(combinations[c])
    comb.append("outcome")
    # create dataframe with selected subset of features
    logistic_df = df_binary_moisture[comb]
    
    # Create dataframes for regression model
    y = logistic_df["outcome"]
    X = logistic_df.drop(columns="outcome")
    
    # Split data into testing and training
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
    
    # Instantiate model
    classifier = LogisticRegression(solver='lbfgs',max_iter=200,random_state=1)
    
    # Run model
    classifier.fit(X_train, y_train)
    
    # Create predictions based on model
    y_pred = classifier.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    
    # Set list to tuple to be set as dictionary key (lists can't be keys)
    # accuracy is set to the value
    results_moisture[tuple(comb)] = acc
    count += 1
    if count%1000 == 0:
        print(count)
        t = time.time()
        print(f'{t - t0} seconds')

t = time.time()
print(f'Finished in {t-t0} seconds')

1000
24.52354407310486 seconds
2000
52.320719718933105 seconds
3000
77.8368330001831 seconds
4000
105.23462915420532 seconds
5000
133.94321584701538 seconds
6000
161.82600378990173 seconds
7000
187.75633573532104 seconds
8000
215.62450289726257 seconds
9000
243.93748712539673 seconds
10000
273.1640729904175 seconds
11000
305.14990305900574 seconds
12000
333.5668520927429 seconds
13000
366.11247205734253 seconds
14000
396.076388835907 seconds
15000
427.57798075675964 seconds
16000
460.0297899246216 seconds
17000
491.4556610584259 seconds
18000
520.7501528263092 seconds
19000
558.2626600265503 seconds
20000
596.0960130691528 seconds
21000
631.468160867691 seconds
22000
669.723207950592 seconds
23000
708.7765209674835 seconds
24000
747.0862679481506 seconds
25000
782.9049189090729 seconds
26000
820.2516930103302 seconds
27000
858.7224819660187 seconds
28000
894.7757868766785 seconds
29000
926.1332018375397 seconds
30000
948.5614597797394 seconds
31000
982.0299048423767 seconds
32000
1014.

In [238]:
# Run same dataset with SVM: support vector machines
model = SVC(kernel='linear')

In [239]:
# Fit model
model.fit(X_train, y_train)

SVC(kernel='linear')

In [240]:
# Run model
y_pred = model.predict(X_test)

In [241]:
# Get accurary score
accuracy_score(y_test, y_pred)

0.6188811188811189

In [242]:
# Run same set with decision tree
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train, y_train)

In [243]:
# Making predictions using the testing data.
predictions = model.predict(X_test)

In [244]:
accuracy_score(y_test, predictions)

0.6048951048951049

In [245]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [246]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [247]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)

In [248]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.5909090909090909
