<a href="https://colab.research.google.com/github/farheenfathimaa/NLP-with-Disaster-Tweets/blob/main/Natural_Language_Processing_with_Disaster_Tweets_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so files stored there are reachable from this Colab runtime
from google.colab import drive

drive.mount(mountpoint="/content/drive/")

Mounted at /content/drive/


In [None]:
# One-time setup: unzip the uploaded data into Google Drive (uncomment the line below on the first run)
#!unzip "/content/drive/MyDrive/nlp-getting-started.zip" -d "/content/drive/MyDrive/nlp-tweets"

# Natural Language Processing with Disaster Tweets

This notebook looks into various Python-based machine learning and data science libraries in an attempt to build a machine learning model capable of predicting whether a given tweet is about a real disaster or not. If so, predict a 1. If not, predict a 0.

We're going to take the following approach:

1. Problem definition
2. Data
3. Evaluation
4. Features
5. Modelling
6. Experimentation

The data comes from the Kaggle competition "Natural Language Processing with Disaster Tweets". [Link](https://www.kaggle.com/competitions/nlp-getting-started/overview)

## Tools

In [None]:
# Import all the tools we need

# Regular EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay

from sklearn.preprocessing import OneHotEncoder #Encode categorical features as a one-hot numeric array.
from sklearn.compose import ColumnTransformer # transform an entire column

In [None]:
# Load the train and test splits from the unzipped dataset folder on Drive.
# DATA_DIR is the single place to edit if the data lives somewhere else.
DATA_DIR = "/content/drive/MyDrive/nlp-tweets"
train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")
train_data, test_data

(         id keyword location  \
 0         1     NaN      NaN   
 1         4     NaN      NaN   
 2         5     NaN      NaN   
 3         6     NaN      NaN   
 4         7     NaN      NaN   
 ...     ...     ...      ...   
 7608  10869     NaN      NaN   
 7609  10870     NaN      NaN   
 7610  10871     NaN      NaN   
 7611  10872     NaN      NaN   
 7612  10873     NaN      NaN   
 
                                                    text  target  
 0     Our Deeds are the Reason of this #earthquake M...       1  
 1                Forest fire near La Ronge Sask. Canada       1  
 2     All residents asked to 'shelter in place' are ...       1  
 3     13,000 people receive #wildfires evacuation or...       1  
 4     Just got sent this photo from Ruby #Alaska as ...       1  
 ...                                                 ...     ...  
 7608  Two giant cranes holding a bridge collapse int...       1  
 7609  @aria_ahrary @TheTawniest The out of control w...       1  

In [None]:
def transform_data(df):
    """Return a numeric, model-ready copy of a tweets DataFrame.

    Missing ``keyword`` values become ``"random"`` and missing ``location``
    values become ``"earth"``. Every string column is then converted to an
    ordered category and replaced by its integer category code, so the result
    keeps the original column names but holds only numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.

    Notes
    -----
    Codes are derived from the values present in ``df`` itself, so the same
    string can receive different codes in two separately transformed frames.
    """
    # fillna returns a new frame, so the caller's data is left untouched and
    # re-running the calling cell is safe. Keys absent from df are ignored.
    transformed = df.fillna({"keyword": "random", "location": "earth"})

    # Replace each string column by the integer codes of its ordered categories.
    # Both checks are needed: strings load as `object` on older pandas and as a
    # dedicated string dtype on newer versions.
    # NOTE(review): an ordinal code per distinct tweet is a crude stand-in for
    # real text features -- consider a text vectorizer for the `text` column.
    for label in list(transformed.columns):
        content = transformed[label]
        if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
            transformed[label] = content.astype("category").cat.as_ordered().cat.codes

    return transformed

In [None]:
# Replace the raw training frame with its transformed version and display it
train_data = transform_data(train_data)
train_data

IndexError: Index dimension must be 1 or 2

In [None]:
# Apply the same transformation to the test frame and display it
test_data = transform_data(test_data)
test_data

UnboundLocalError: local variable 'train_data' referenced before assignment

## Data is ready, now we can make a model

In [None]:
# Separate the features from the label before splitting.
# NOTE(review): `id` looks like a row identifier rather than a feature, so it
# is left out of X -- confirm.
X = train_data.drop(columns=["id", "target"])
y = train_data["target"]

# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split identical on every run
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Candidate classifiers, keyed by the label used when reporting their scores
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
}

# Helper that trains each candidate model and records its test score
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fits and evaluates the given machine learning models.

    models : a dict mapping a display name to a Scikit-Learn style model
    X_train : training data (no labels)
    X_test : testing data (no labels)
    y_train : training labels
    y_test : test labels

    Returns a dict mapping each model name to the value of its ``score``
    method on the test data. The models are fitted in place.
    """
    def _fit_then_score(estimator):
        # fit and score are called separately so nothing is assumed about
        # what fit() returns
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    # Seed NumPy's global RNG once, before any model is fitted
    np.random.seed(42)

    # One entry per model, in the same order as the input dict
    return {label: _fit_then_score(estimator)
            for label, estimator in models.items()}

In [None]:
# Fit every candidate model and collect its score on the held-out split
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

In [None]:
# Bar chart comparing the models' scores; labelled so the figure stands on its own
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
ax = model_compare.T.plot.bar(legend=False)
ax.set(title="Model comparison", xlabel="Model", ylabel="Accuracy");

In [None]:
# Search space for LogisticRegression: 20 log-spaced values of C, liblinear solver only
log_reg_grid = dict(C=np.logspace(-4, 4, num=20),
                    solver=["liblinear"])

# Search space for RandomForestClassifier
rf_grid = dict(n_estimators=np.arange(10, 1000, 50),
               max_depth=[None, 3, 5, 10],
               min_samples_split=np.arange(2, 20, 2),
               min_samples_leaf=np.arange(1, 20, 2))

In [None]:
# Randomised hyperparameter search for LogisticRegression

# Seed NumPy's global RNG before the search is built
np.random.seed(42)

# 20 candidates sampled from log_reg_grid, each evaluated with 5-fold cross-validation
rs_log_reg = RandomizedSearchCV(estimator=LogisticRegression(),
                                param_distributions=log_reg_grid,
                                n_iter=20,
                                cv=5,
                                verbose=True)

# Run the search on the training split
rs_log_reg.fit(X_train, y_train)

In [None]:
rs_log_reg.best_params_

In [None]:
rs_log_reg.score(X_test, y_test)

In [None]:
# Randomised hyperparameter search for RandomForestClassifier

# Seed NumPy's global RNG before the search is built
np.random.seed(42)

# 20 candidates sampled from rf_grid, each evaluated with 5-fold cross-validation
rs_rf = RandomizedSearchCV(estimator=RandomForestClassifier(),
                           param_distributions=rf_grid,
                           n_iter=20,
                           cv=5,
                           verbose=True)

# Run the search on the training split
rs_rf.fit(X_train, y_train)

In [None]:
rs_rf.best_params_

In [None]:
rs_rf.score(X_test, y_test)

In [None]:
# Finer search space for LogisticRegression: 30 log-spaced values of C.
# This rebinds log_reg_grid, replacing the grid used by the randomised search.
log_reg_grid = dict(C=np.logspace(-4, 4, num=30),
                    solver=["liblinear"])

# Exhaustive search over every value in the grid, 5-fold cross-validation each
gs_log_reg = GridSearchCV(estimator=LogisticRegression(),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=2)

# Run the search on the training split
gs_log_reg.fit(X_train, y_train)

In [None]:
# Check the best hyperparameters
gs_log_reg.best_params_

In [None]:
# Evaluate the hyperparameter search for LogisticRegression
gs_log_reg.score(X_test, y_test)

## Evaluating our tuned machine learning classifier, beyond accuracy
* ROC curve and AUC score
* Confusion matrix
* Classification report
* Precision
* Recall
* F1-score

In [None]:
# Make predictions with tuned model
y_preds = gs_log_reg.predict(X_test)

In [None]:
y_preds

In [None]:
y_test

In [None]:
# Plot ROC curve and calculate AUC metric
RocCurveDisplay.from_estimator(gs_log_reg, X_test, y_test);

In [None]:
sns.set(font_scale=1.5)

def plot_conf_mat(y_test, y_preds):
    """
    Plots a labelled confusion matrix using Seaborn's heatmap().

    y_test : true labels
    y_preds : predicted labels, in the same order as y_test
    """
    _, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, y_preds),
                annot=True,
                fmt="d",  # whole-number counts; the default ".2g" shows e.g. 870 as 8.7e+02
                cbar=False,
                ax=ax)    # draw on the axes created above, not whichever is current
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

plot_conf_mat(y_test, y_preds)

In [None]:
print(classification_report(y_test, y_preds))

### Calculate evaluation metrics using cross-validation

In [None]:
gs_log_reg.best_params_

In [None]:
# New model built from the grid search's best hyperparameters. Reading them
# from gs_log_reg keeps this cell in sync when the search is re-run, instead
# of relying on a value pasted in by hand.
clf = LogisticRegression(**gs_log_reg.best_params_)

In [None]:
# Cross-validated f1-score over all labelled rows.
# NOTE(review): `id` is excluded from the features on the assumption that it
# is only a row identifier -- confirm.
cv_f1_scores = cross_val_score(clf,
                               train_data.drop(columns=["id", "target"]),
                               train_data["target"],
                               cv=5,
                               scoring="f1")

# Average the per-fold scores into a single number
cv_f1 = np.mean(cv_f1_scores)
cv_f1

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.coef_

In [None]:
# Match each coefficient to the feature it belongs to. The names are taken
# from the fitted model itself so they line up with clf.coef_; if the model
# was fitted on unnamed data, fall back to positional indices.
feature_names = getattr(clf, "feature_names_in_", range(clf.coef_.shape[1]))
feature_dict = dict(zip(feature_names, clf.coef_[0].tolist()))
feature_dict