In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
# Read in the data
disaster = pd.read_csv('~/meds/eds-232/EDS232-discussion/data/disaster.csv')

In [34]:
# Cleaning text data
# using r means that we're not using \ as anything but what it is
def preprocess(text):
    text = text.lower() # lowercase
    text=  text.strip() # removing leading/trailing spaces
    text=  re.sub(r'<.*?>','', text) # remove html syntax 
    text = re.sub(r'[^\w\s]','',text)  # remove punctuation
    text = re.sub(r'\[[0-9]*\]',' ',text) # remove reference numbers (numbers in brackets)
    text = re.sub(r'\d',' ',text)  # remove digits
    text = re.sub(r'\s+', ' ', text) # collapsing multiple spaces into a single space
    return text

In [35]:
# apply string cleaning to text variable
disaster['clean_text'] = disaster['text'].apply(preprocess)

In [36]:
disaster.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [37]:
# proof that Tfidf vectorizer excludes stopwords

stop_words = ["On March 5th, I will crush my capstone presentation with my awesome team."]

vectorizer_english = TfidfVectorizer(stop_words = "english")

vectorizer_english.fit_transform(stop_words)

print("Remaining words:")
print(vectorizer_english.get_feature_names_out())

Remaining words:
['5th' 'awesome' 'capstone' 'crush' 'march' 'presentation' 'team']


In [38]:
# split into train and test

X_train, X_test, y_train, y_test = train_test_split(disaster['clean_text'], disaster['target'], test_size = 0.3, random_state = 42)

In [41]:
# vectorize words

tfidf_vectorizer = TfidfVectorizer(stop_words = "english")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.fit(X_test)

In [42]:
# initialize a logistic regression model and fit to vectorized training data

lr_model = LogisticRegression(random_state = 42)
lr_model.fit(X_train_tfidf, y_train)
y_pred = lr_model.predict(X_test_tfidf)

ValueError: Expected 2D array, got scalar array instead:
array=TfidfVectorizer(stop_words='english').
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# calculate LR accuracy

accuracy = accuracy_score (y_test, y_pred)
print(f"Accuracy: {accuracy}")


# create confusion matrix
cm = confusionmatrix(y_test, y_pred)
plt.figure(figsize = (8,6))
sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'GnBu',
           xticklabels = ["No Disaster", "Disaster"],
           yticklabels = ["No Disaster", "Disaster"])
plt.title('logistic regression model performance')
plt.xlabel('predicted')
plt.ylabel('True')
plt.show()

In [46]:
# test model with new data

new_text = [
    "BREAKING: Massive earthquake hits the coast",
    "I love watching disasters movies on netflix",
    "thousands evacuated as hurricane approaches",
    "TTHHEEEESSSEE is a disasterrrr",
    "It's windy",
    "The palisades fire has damaged over 7,000 structures",
    "I broke my tooth on a jawbreaker"
]


# preprocess new phrases
cleaned_new_text = [preprocess(text) for text in new_text]


# transform using TF-IDF vectorizer
new_features = tfidf_vectorizer.transform(cleaned_new_text)

# make predictions
predictions = lr_model.predict(new_features)

# check outcomes

for text, pred in zip(new_text, predictions):
    print(f"Text: {text}")
    print(f"Predictions: {'real disaster' if pred == 1 else 'not a real disaster'}\n")
          

ValueError: X has 9807 features, but LogisticRegression is expecting 18112 features as input.