# Programming Exercises

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [7]:
df_size = len(df_train)
disasters_size = len(df_train[df_train.target == 1])
print(f"Training set size: {df_size} (Distaters: {disasters_size}, Other: {df_size-disasters_size})")

df_test_size = len(df_test)
print(f"Testing set size: {df_test_size}")

Training set size: 7613 (Distaters: 3271, Other: 4342)
Testing set size: 3263


## 1
### a
7613 Training Points (3271 Disasters), 3263 Testing Points

In [9]:
# ID is uselesss and Location may bee too noisy
df_train = df_train.drop(columns=["id", "location"])
df_test = df_test.drop(columns=["id", "location"])

In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words as englishwords
from nltk.stem import WordNetLemmatizer
def tokenize(string, lower=True, punctuation=True, stops=True, lemmatize=True, check_english=True):
    stop_words = set(stopwords.words("english"))
    english_words = set(englishwords.words())
    words = word_tokenize(string)
    lzr = WordNetLemmatizer() 

    if lower:
        words = [w.lower() for w in words]
    if punctuation:
        words = [w for w in words if w.isalnum()]
    if stops:
        words = [w for w in words if w not in stop_words]
    if lemmatize:
        words = [lzr.lemmatize(w) for w in words]
    # Takes long to run, use with caution
    if check_english:
        words = [w for w in words if w in english_words]
    return words

In [11]:
from nltk.stem import WordNetLemmatizer
lzr = WordNetLemmatizer()
df_test.text = df_test.text.apply(lambda x: tokenize(x))
df_test.keywords = df_test.keyword.apply(lambda x: x.lower() if isinstance(x, str) else x)

In [12]:
df_test.to_csv("filtered_test.csv")

## Run from here

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
df = pd.read_csv("filtered_train.csv", index_col=False)
df = df.rename(columns={"target": "is_disaster"})

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary=True, min_df=5)
sample = df.text.iloc[0]
X = vectorizer.fit_transform(df.text.values)
bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df = pd.concat([df.drop(columns="Unnamed: 0"), bag_of_words], axis=1, join="inner")

In [15]:
X_train, X_dev, y_train, y_dev = train_test_split(df.drop(columns=["is_disaster", "keyword", "text"]), df.is_disaster, train_size=0.7, random_state=37)

In [16]:
from sklearn.metrics import classification_report

### Logistic Regression

In [362]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=37).fit(X_train, y_train)
print(classification_report(y_dev, clf.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82      1317
           1       0.77      0.69      0.73       967

    accuracy                           0.78      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.78      0.78      0.78      2284



### Linear SVM

In [363]:
from sklearn.svm import LinearSVC
lsvm = LinearSVC(random_state=37).fit(X_train, y_train)
print(classification_report(y_dev, lsvm.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.78      0.79      0.79      1317
           1       0.71      0.70      0.70       967

    accuracy                           0.75      2284
   macro avg       0.75      0.74      0.74      2284
weighted avg       0.75      0.75      0.75      2284



### Non-Linear SVM

In [364]:
from sklearn.svm import SVC
svm = SVC(random_state=37, C=1).fit(X_train, y_train)
print(classification_report(y_dev, svm.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84      1317
           1       0.85      0.64      0.73       967

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.79      2284
weighted avg       0.81      0.80      0.79      2284



### Naive Bayes

In [17]:
import numpy as np
from numba import njit

class NaiveBayes:
    # Bernoulli Naive Bayes Classifer using Pandas
    def fit(self, X: pd.DataFrame, y: pd.Series):
        self.classes = np.unique(y)
        self.means = X.groupby(y).mean().clip(1e-14, 1-1e-14) # psis
        self.priors = X.groupby(y).apply(lambda x: len(x)) / X.shape[0] # phis
        return self
    
    def predict(self, X):
        # Feature Length
        len_features = X.shape[0]

        # Calculate log values
        log_means = self.means.apply(np.log)
        log_means_minus = (1-self.means).apply(np.log)
        log_priors = self.priors.apply(np.log)

        # Containers
        conditional_probs = np.zeros((self.classes.size, len_features))

        # Calculate log probabilities for each class
        for i in range(self.classes.size):
            log_probability = X*log_means.iloc[i] + (1-X)*log_means_minus.iloc[i]
            log_sum = log_probability.sum(axis=1) + log_priors.loc[i]
            conditional_probs[i] = log_sum

        return conditional_probs.argmax(axis=0)

In [366]:
nb = NaiveBayes().fit(X_train, y_train)
print(classification_report(y_dev, nb.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.79      0.84      0.81      1317
           1       0.76      0.69      0.72       967

    accuracy                           0.77      2284
   macro avg       0.77      0.76      0.77      2284
weighted avg       0.77      0.77      0.77      2284



In [367]:
# Test SKLearn Implementation
from sklearn.naive_bayes import BernoulliNB
sk_nb = BernoulliNB().fit(X_train, y_train)
print(classification_report(y_dev, sk_nb.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83      1317
           1       0.79      0.69      0.74       967

    accuracy                           0.79      2284
   macro avg       0.79      0.78      0.78      2284
weighted avg       0.79      0.79      0.79      2284



In [368]:
df = pd.read_csv("filtered_train.csv", index_col=False)
df = df.rename(columns={"target": "is_disaster"})

vectorizer = CountVectorizer(binary=True, min_df=5, ngram_range=(1,2))
X = vectorizer.fit_transform(df.text.values)
bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df = pd.concat([df.drop(columns="Unnamed: 0"), bag_of_words], axis=1, join="inner")

X_train, X_dev, y_train, y_dev = train_test_split(df.drop(columns=["is_disaster", "keyword", "text"]), df.is_disaster, train_size=0.7, random_state=37)

In [370]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=37).fit(X_train, y_train)
print(classification_report(y_dev, clf.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.79      0.86      0.82      1317
           1       0.78      0.68      0.73       967

    accuracy                           0.78      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.78      0.78      0.78      2284



In [371]:
from sklearn.svm import LinearSVC
lsvm = LinearSVC(random_state=37).fit(X_train, y_train)
print(classification_report(y_dev, lsvm.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.78      0.79      0.78      1317
           1       0.71      0.69      0.70       967

    accuracy                           0.75      2284
   macro avg       0.74      0.74      0.74      2284
weighted avg       0.75      0.75      0.75      2284



In [373]:
from sklearn.svm import SVC
svm = SVC(random_state=37, C=1).fit(X_train, y_train)
print(classification_report(y_dev, svm.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84      1317
           1       0.84      0.64      0.73       967

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.78      2284
weighted avg       0.81      0.80      0.79      2284



In [372]:
nb = NaiveBayes().fit(X_train, y_train)
print(classification_report(y_dev, nb.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.77      0.88      0.82      1317
           1       0.80      0.64      0.71       967

    accuracy                           0.78      2284
   macro avg       0.78      0.76      0.77      2284
weighted avg       0.78      0.78      0.77      2284



In [374]:
df = pd.read_csv("filtered_train.csv", index_col=False)
df = df.rename(columns={"target": "is_disaster"})

vectorizer = CountVectorizer(binary=True, min_df=5, ngram_range=(1,2))
X = vectorizer.fit_transform(df.text.values)
bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df = pd.concat([df.drop(columns="Unnamed: 0"), bag_of_words], axis=1, join="inner")

In [386]:
pd.concat([df.drop(columns=["keyword", "text"]), pd.get_dummies(df.keyword, dummy_na=True, prefix="keyword")], axis=1)

Unnamed: 0,is_disaster,aba,aba woman,abandoned,abandoned aircraft,ablaze,able,absolutely,abuse,access,...,keyword_whirlwind,keyword_wild%20fires,keyword_wildfire,keyword_windstorm,keyword_wounded,keyword_wounds,keyword_wreck,keyword_wreckage,keyword_wrecked,keyword_nan
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7609,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7610,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7611,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [388]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=37).fit(X_train, y_train)
print(classification_report(y_dev, clf.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.79      0.86      0.82      1317
           1       0.78      0.68      0.73       967

    accuracy                           0.78      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.78      0.78      0.78      2284



In [389]:
from sklearn.svm import LinearSVC
lsvm = LinearSVC(random_state=37).fit(X_train, y_train)
print(classification_report(y_dev, lsvm.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.78      0.79      0.78      1317
           1       0.71      0.69      0.70       967

    accuracy                           0.75      2284
   macro avg       0.74      0.74      0.74      2284
weighted avg       0.75      0.75      0.75      2284



In [391]:
from sklearn.svm import SVC
svm = SVC(random_state=37, C=1).fit(X_train, y_train)
print(classification_report(y_dev, svm.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84      1317
           1       0.84      0.64      0.73       967

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.78      2284
weighted avg       0.81      0.80      0.79      2284



In [390]:
nb = NaiveBayes().fit(X_train, y_train)
print(classification_report(y_dev, nb.predict(X_dev))) 

              precision    recall  f1-score   support

           0       0.77      0.88      0.82      1317
           1       0.80      0.64      0.71       967

    accuracy                           0.78      2284
   macro avg       0.78      0.76      0.77      2284
weighted avg       0.78      0.78      0.77      2284



In [None]:
import optuna
seed = 37
def objective(trial):
    model = trial.suggest()q
    C = trial.suggest()
    kernel = trial.suggest()
    degree = trial.suggest()
    gamma = trial.suggest()
    if kernel == "poly" or kernel == "sigmoid":
        coef0 = trial.suggest()

In [28]:
df_train = pd.read_csv("filtered_train.csv", index_col=False)
df_test = pd.read_csv("filtered_test.csv", index_col=False)
df_train = df_train.rename(columns={"target": "is_disaster"})
df_test = df_test.rename(columns={"target": "is_disaster"})

In [35]:
all_data = pd.concat([df_train, df_test])
vectorizer = CountVectorizer(binary=True, min_df=5, ngram_range=(1,2))
X = vectorizer.fit_transform(all_data.text.values)
bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
all_data = pd.concat([all_data.drop(columns="Unnamed: 0"), bag_of_words], axis=1, join="inner")
all_data  = pd.concat([all_data.drop(columns=["keyword", "text"]), pd.get_dummies(all_data.keyword, dummy_na=True, prefix="keyword")], axis=1)

In [41]:
train = all_data[all_data.is_disaster.notnull()]
X_test = all_data[all_data.is_disaster.isnull()].drop(columns="is_disaster")

In [51]:
from sklearn.svm import SVC
svm = SVC(random_state=37, C=1).fit(train.drop(columns="is_disaster"), train.is_disaster)

In [53]:
predictions = pd.Series(svm.predict(X_test))

In [57]:
output = pd.DataFrame(data={"id": range(predictions.size), "target": predictions})

In [62]:
output.target = output.target.apply(int)

In [63]:
output.to_csv("submission.csv", index=False)