In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
#Dataset preparation
import json
import pandas as pd

path = '/content/drive/MyDrive/flood_project/Training_dataset/'
testing_path = '/content/drive/MyDrive/flood_project/test_set/'

#Load the training dataset using metadata and label data.
train_df = pd.read_csv(path + 'devset_images_gt.csv')
train_df.columns=(['image_id', 'label'])
train_df["image_id"] = train_df["image_id"].astype(str)
train_df = train_df[train_df.columns[[0, 1]]]

json_filename = path + "devset_images_metadata.json"
with open(json_filename) as json_file:
    data = json.load(json_file)

required_columns = ['description', 'user_tags', 'image_id', 'title']
json_df = pd.DataFrame(data['images'], columns = required_columns)
print(json_df.head())
#merge label and json dataframes
train_df = pd.merge(train_df, json_df, how='inner')
train_df.head()


#Load the testing dataset using metadata and label data.
test_df = pd.read_csv(testing_path + 'testset_images_gt.csv')
test_df.columns=(['image_id', 'label'])
test_df["image_id"] = test_df["image_id"].astype(str)
test_df = test_df[test_df.columns[[0, 1]]]

json_filename = testing_path + "testset_images_metadata.json"
with open(json_filename) as json_file:
    data = json.load(json_file)

required_columns = ['description', 'user_tags', 'image_id', 'title']
json_df = pd.DataFrame(data['images'], columns = required_columns)
#merge label and json dataframes
test_df = pd.merge(test_df, json_df, how='inner')
test_df.head()

                                         description  \
0                                               None   
1                                               None   
2  After the flood, the boarded up stores bear up...   
3                                               None   
4                                               None   

                                           user_tags    image_id  \
0                 [2009 road trip, obrero road trip]  3519864665   
1   [daulatabad, daulatabad fort, ellora, road trip]  4896119055   
2  [cedarrapids, createsouthroadtrip2009, disaste...  3468473862   
3          [cork, enchente, flood, ireland, irlanda]  4120853942   
4  [athens georgia, brown, current, flood, mud, r...  4436083254   

                    title  
0         Biltmore Estate  
1             Chand Minar  
2      Uplifting Graffiti  
3                DSCF6487  
4  Oconoe river - flooded  


Unnamed: 0,image_id,label,description,user_tags,title
0,347783223,0,The site of former mass graves. The graves use...,"[ cambodia, history, genocide]",Killing fields
1,6310531921,1,The rains had hit hard in Siem Reap and the pa...,[],Cambodia - flooded path to temple 2
2,8329435841,1,"War Eagle Mill, located along the War Eagle Cr...","[arkansas, creek, eureka springs, gristmill, h...",War Eagle Mill
3,3330434964,0,Some guy is keeping some buffalo down the road...,"[bogue chitto, buffalo, cannon, eos, farm, hom...",20090131_7286
4,2862785695,0,"I think this was a restaurant building, but I'...","[abandoned, amusement park, boarded up, canon ...",building


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [5]:
train_df = train_df.drop(columns=['user_tags','image_id'])
test_df = test_df.drop(columns=['user_tags','image_id'])
train_df = train_df.fillna(" ")
test_df = test_df.fillna(" ")

In [6]:
X_train = train_df[['description', 'title']]
y_train = train_df['label'].tolist()
X_test = test_df[['description', 'title']]
y_test = test_df['label'].tolist()

In [7]:
#Generating the embeddings for description and title
sentence_embeddings_description_train = model.encode(X_train.loc[:,'description'].tolist())
sentence_embeddings_title_train = model.encode(X_train.loc[:,'title'].tolist())


In [8]:
sentence_embeddings_description_test = model.encode(X_test.loc[:,'description'].to_list())
sentence_embeddings_title_test = model.encode(X_test.loc[:,'title'].to_list())

In [9]:
#Concatenating the embeddings
X_train = []
for i in range(len(sentence_embeddings_description_train)):
  X_train.append(sentence_embeddings_description_train[i].tolist() + sentence_embeddings_title_train[i].tolist())
X_test = []
for i in range(len(sentence_embeddings_description_test)):
  X_test.append(sentence_embeddings_description_test[i].tolist() + sentence_embeddings_title_test[i].tolist())

In [None]:
#Model building and training
from sklearn.svm import SVC
svm = SVC(kernel='rbf')

svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
#Test set evaluation metrics
from sklearn.metrics import classification_report

#predict response using SVM
svm_y_pred = svm.predict(X_test)

# calculate report for svm model
svm_report = classification_report(y_test, svm_y_pred, target_names=['not flooded', 'flooded'])

print('SVM Model classification report is: \n', svm_report)

SVM Model classification report is: 
               precision    recall  f1-score   support

 not flooded       0.80      0.77      0.78       839
     flooded       0.62      0.65      0.64       480

    accuracy                           0.73      1319
   macro avg       0.71      0.71      0.71      1319
weighted avg       0.73      0.73      0.73      1319



In [None]:
#Fine tuning the model using GridSearch
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [2, 1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'linear']}
 
svm_grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
 
# fitting the model for grid search
svm_grid.fit(X_train, y_train)

# print how our model looks after hyper-parameter tuning
print(svm_grid.best_estimator_)

In [10]:
#Fine-tuned Model
# C=1, gamma=0.001, kernel=rbf, score=0.795, total=  37.6s
from sklearn.svm import SVC
svm = SVC(kernel='rbf', C=1, gamma=0.001)

svm.fit(X_train, y_train)

SVC(C=1, gamma=0.001)

In [11]:
#Evaluation of the fine-tuned model
from sklearn.metrics import classification_report

#predict response using SVM
svm_y_pred = svm.predict(X_test)

# calculate report for svm model
svm_report = classification_report(y_test, svm_y_pred, target_names=['not flooded', 'flooded'])

print('SVM Model classification report is: \n', svm_report)

SVM Model classification report is: 
               precision    recall  f1-score   support

 not flooded       0.79      0.91      0.84       839
     flooded       0.78      0.57      0.66       480

    accuracy                           0.79      1319
   macro avg       0.78      0.74      0.75      1319
weighted avg       0.79      0.79      0.78      1319

