# 0. Importing Dataset
<hr>


In [1]:
import os

%matplotlib inline
# import string
# from collections import deque
# 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# 
# # data
# from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
# from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
# 
# # Feature selection
# from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer
# 
# # classifiers / models
from sklearn.linear_model import LogisticRegression
# 
# # other
# from sklearn.metrics import (
#     accuracy_score,
#     log_loss,
#     make_scorer,
#     mean_squared_error,
#     precision_score,
#     recall_score,
#     f1_score
# )
from sklearn.model_selection import (
#     GridSearchCV,
#     RandomizedSearchCV,
#     ShuffleSplit,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
#     OrdinalEncoder,
#     PolynomialFeatures,
#     StandardScaler
)
# from sklearn.svm import SVC, SVR
chosen_seed = 2021

# 1. Import Data
<hr>

In [2]:
### BEGIN STARTER CODE

dataset_df = pd.read_csv("data/train.csv", usecols=["keyword", "location", "text", "target"])
train_df, test_df = train_test_split(dataset_df, test_size=0.2, random_state=chosen_seed)
train_df.head()

### BEGIN STARTER CODE

Unnamed: 0,keyword,location,text,target
1609,collapse,"Chicago, Illinois",Only one commodity has escaped the total colla...,0
1740,collision,'soooota,@Zojadelin you literally almost had a head on ...,1
3223,emergency%20services,"Nevada, USA",Can you recommend anyone for this #job? RN Eme...,0
282,apocalypse,Oakland,Julie + R is the apocalypse version of Romeo +...,0
2515,desolation,,RT @FreeDiscountBks: **Desolation Run** #FREE ...,0


In [3]:
text_feature = "text"
target = "target"

X_train, y_train = train_df.drop(columns=["target", "location"]), train_df[target]
X_test, y_test = test_df.drop(columns=["target", "location"]), test_df[target]

# 2. EDA
<hr>

- We have a class imbalance scenario.
- We want to predict as much positive cases as possible as this will help us to act preemptively to take measures in case if a disaster is eminent. However, too much of false positive can create unnecessary panic and people might not take the threat predictions seriously in future. Hence, we will be using **f1-score** as our evaluation metrics, so that we maximize precision and recall together, assuming that this threat prediction model will be used for both general and security purposes
    - However, if we are creating this model for security agencies, recall might be a better option as false positive won't hurt other than taking some extra precaution.
- We will be using `keyword` feature as categorical feature and we can apply one-hot encoding
- We can apply CountVectorizer on `text` feature which is already used in the starter code
- We will ignore `location` feature for now.

In [4]:
### YOUR ANSWER HERE
class_dist = train_df.value_counts("target").reset_index(name="count")
class_dist["perc"] = class_dist["count"]/train_df.shape[0]
class_dist

Unnamed: 0,target,count,perc
0,0,3500,0.574713
1,1,2590,0.425287


In [5]:
print(f"Number of unique keywords: {len(train_df.keyword.unique())}")

Number of unique keywords: 222


# 3. Housekeeping
<hr>

In [6]:
def summarize_cv_scores(X, classifier_name):
    """
    This function summarizes the output of cross_validate function 
    from sklearn.model_selection and provides the mean and 
    standard deviation of all columns.

    Parameters
    ----------
    X : dict
        The output of cross_validate function from sklearn.model_selection.

    classifier_name : string
        Name of the classifier

    Examples
    --------
    Constructing toy example for X dictionary.

    >>> toy_score = {
        "fit_time": np.array([0.1, 0.2, 0.3, 0.4, 0.5]),
        "score_time": np.array([1, 2, 3, 4, 5]),
        "test_accuracy": np.array([0.5, 0.5, 0.5, 0.5, 0.5]),
        "train_accuracy": np.array([0.5, 0.5, 0.5, 0.5, 0.5]),
        "test_f1": np.array([0.1, 0.1, 0.2, 0.1, 0.1]),
        "train_f1": np.array([0.1, 0.3, 0.1, 0.1, 0.1]),
    }
    
    Using the function
    
    >>> summarize_cv_scores(toy_score, "toy_test")
    """
    X_df = pd.DataFrame(X)
    col_names = (
        pd.Series(X_df.columns.tolist()).str.replace("test_", "validation_").tolist()
    )
    col_names = [f"{t}_{i}" for t in ["mean", "std"] for i in col_names]
    X_df = pd.DataFrame(pd.concat([X_df.mean(), X_df.std()])).T
    X_df.columns = col_names
    X_df["classifier_name"] = classifier_name
    col_names = ["classifier_name"] + col_names
    return X_df[col_names]


toy_score = {
    "fit_time": np.array([0.1, 0.2, 0.3, 0.4, 0.5]),
    "score_time": np.array([1, 2, 3, 4, 5]),
    "test_accuracy": np.array([0.5, 0.5, 0.5, 0.5, 0.5]),
    "train_accuracy": np.array([0.5, 0.5, 0.5, 0.5, 0.5]),
    "test_f1": np.array([0.1, 0.1, 0.2, 0.1, 0.1]),
    "train_f1": np.array([0.1, 0.3, 0.1, 0.1, 0.1]),
}

expected = {
    "classifier_name": ["toy_test"],
    "mean_fit_time": [0.3],
    "mean_score_time": [3],
    "mean_validation_accuracy": [0.5],
    "mean_train_accuracy": [0.5],
    "mean_validation_f1": [0.12],
    "mean_train_f1": [0.14],
    "std_fit_time": [0.158114],
    "std_score_time": [1.581139],
    "std_validation_accuracy": [0.0],
    "std_train_accuracy": [0.0],
    "std_validation_f1": [0.044721],
    "std_train_f1": [0.089443],
}


assert isinstance(
    summarize_cv_scores(toy_score, "toy_test"), pd.DataFrame
), "Check data structure"
assert (
    int(
        (
            np.round(summarize_cv_scores(toy_score, "toy_test"), 4)
            == np.round(pd.DataFrame(data=expected), 4)
        ).T.sum()
    )
    == 13
), "Check function logic"
print("Success!")

Success!


# 4. Baseline model
<hr>

In [7]:
model_dummy = DummyClassifier(strategy="stratified", random_state=chosen_seed)
scores = cross_validate(
    model_dummy,
    X_train,
    y_train,
    scoring=["f1", "accuracy"],
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

mean_scores_df = summarize_cv_scores(scores, "Dummy Classifier (stratified)")
mean_scores_df

Unnamed: 0,classifier_name,mean_fit_time,mean_score_time,mean_validation_f1,mean_train_f1,mean_validation_accuracy,mean_train_accuracy,std_fit_time,std_score_time,std_validation_f1,std_train_f1,std_validation_accuracy,std_train_accuracy
0,Dummy Classifier (stratified),0.00112,0.002309,0.440613,0.437563,0.520525,0.516872,0.000139,0.000131,0.013341,0.00522,0.011435,0.004484


# 5. Preprocessing
<hr>

In [8]:
preprocessor = make_column_transformer((CountVectorizer(max_features=20_000, stop_words="english"), "text"))

# 6. Logistic Regression
<hr>

In [9]:
model_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(random_state=chosen_seed, class_weight="balanced"),
)

scores = cross_validate(
    model_pipe,
    X_train,
    y_train,
    scoring=["f1", "accuracy"],
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

mean_scores_df = pd.concat(
    [mean_scores_df, summarize_cv_scores(scores, "Logistic Regression (without keyword)")]
)
mean_scores_df

Unnamed: 0,classifier_name,mean_fit_time,mean_score_time,mean_validation_f1,mean_train_f1,mean_validation_accuracy,mean_train_accuracy,std_fit_time,std_score_time,std_validation_f1,std_train_f1,std_validation_accuracy,std_train_accuracy
0,Dummy Classifier (stratified),0.00112,0.002309,0.440613,0.437563,0.520525,0.516872,0.000139,0.000131,0.013341,0.00522,0.011435,0.004484
0,Logistic Regression (without keyword),0.274897,0.018164,0.73968,0.972549,0.787028,0.976888,0.023235,0.001234,0.018167,0.001804,0.014694,0.001526


# 7. Include _keyword_ feature
<hr>

In [14]:
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessor = make_column_transformer(
    (CountVectorizer(max_features=20_000, stop_words="english"), "text"), (categorical_pipeline, ["keyword"])
)

In [15]:
model_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(random_state=chosen_seed, class_weight="balanced"),
)

In [16]:
scores = cross_validate(
    model_pipe,
    X_train,
    y_train,
    scoring=["f1", "accuracy"],
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

mean_scores_df = pd.concat(
    [mean_scores_df, summarize_cv_scores(scores, "Logistic Regression (with keyword)")]
)
mean_scores_df

Unnamed: 0,classifier_name,mean_fit_time,mean_score_time,mean_validation_f1,mean_train_f1,mean_validation_accuracy,mean_train_accuracy,std_fit_time,std_score_time,std_validation_f1,std_train_f1,std_validation_accuracy,std_train_accuracy
0,Dummy Classifier (stratified),0.00112,0.002309,0.440613,0.437563,0.520525,0.516872,0.000139,0.000131,0.013341,0.00522,0.011435,0.004484
0,Logistic Regression (without keyword),0.274897,0.018164,0.73968,0.972549,0.787028,0.976888,0.023235,0.001234,0.018167,0.001804,0.014694,0.001526
0,Logistic Regression (with keyword),0.000967,0.0,,,,,0.000125,0.0,,,,
0,Logistic Regression (with keyword),0.380465,0.027819,0.743785,0.972957,0.789163,0.977217,0.059621,0.005877,0.016025,0.001903,0.013231,0.001603


- Our scores (accuracy and f1) using Logistic Regression with keyword feature is better
- But, the scores (accuracy and f1) using Logistic Regression with keyword feature is almost similar to the results obtained using Logistic Regression without keyword feature 
    - This is because keywords are subset of the text and when we use count vectorizer on text, there are chances that we well get a feature with the keyword again. So we are not adding anything significant to our model.

### Exercise 1.6: Adding new features
rubric={reasoning:5}

Is it possible to further improve the scores? How about adding new features based on our intuitions? 

**Your tasks:**

1. Name 3 to 4 additional features you think would be helpful in predicting the target. An example would be a binary feature "has_emoticons" indicating whether the tweet has emoticons or not. Explain your intuition behind the features and discuss how hard in would be to engineer these features. 

**solution_1_6_1**

### YOUR ANSWER HERE

- A binary feature `has_hyperlinks` indicating whether the tweet has hyperlink or not.
    - A tweet of a disaster event is likely to have a hyperlink of the news or other related article.
- A binary feature `is_valid_location`
    - A disaster event is likely to affect a physical location and identifying the location can be of added value
- A binary feature `has_numbers`
    - A disaster event is likely to have number of people affected in a related tweet

### Exercise 1.7: Extracting your own features 
rubric={accuracy:4,reasoning:4}

In this exercise, we will be adding some very basic length-related and sentiment features.  

You will need to install a popular library called `nltk` for this exercise. For that, run the following commands in your `conda` environment. 

```
conda install -c anaconda nltk 
nltk.download("vader_lexicon")
nltk.download("punkt")
```        

Run the starter code below creates three new features: 
- Relative character length in the tweet. 
- Number of words in the tweet.
- Sentiment of the tweet (positive (pos), negative (neg), neutral (neu), compound (mixture of different sentiments)). In 571, you carried out sentiment analysis on the IMDB data set. Here we are using some pre-trained machine learning model to extract sentiment expressed in the tweets. 

**Your tasks:**

1. Extract at least two more features that you think might be relevant for prediction and store them as new columns in the train and test sets. Briefly explain your intuition on why these features might help the prediction task. 
2. Would it have been OK to create new columns directly in the original `df` instead of creating them separately for train and test splits? Would that be violation of the golden rule? 

In [17]:
### BEGIN STARTER CODE

import nltk

nltk.download("vader_lexicon")
nltk.download("punkt")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

### END STARTER CODE

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/debananda/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/debananda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
### BEGIN STARTER CODE


def get_relative_length(text, TWITTER_ALLOWED_CHARS=280.0):
    """
    Returns the relative length of text.

    Parameters:
    ------
    text: (str)
    the input text

    Keyword arguments:
    ------
    TWITTER_ALLOWED_CHARS: (float)
    the denominator for finding relative length

    Returns:
    -------
    relative length of text: (float)

    """
    return len(text) / TWITTER_ALLOWED_CHARS


def get_length_in_words(text):
    """
    Returns the length of the text in words.

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    length of tokenized text: (int)

    """
    return len(nltk.word_tokenize(text))


def get_sentiment(text):
    """
    Returns the maximum scoring sentiment of the text

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    sentiment of the text: (str)
    """
    scores = sid.polarity_scores(text)
    return max(scores, key=lambda x: scores[x])


### YOUR ANSWER HERE

In [19]:
### BEGIN STARTER CODE

train_df = train_df.assign(n_words=train_df["text"].apply(get_length_in_words))
train_df = train_df.assign(sentiment=train_df["text"].apply(get_sentiment))
train_df = train_df.assign(rel_char_len=train_df["text"].apply(get_relative_length))

test_df = test_df.assign(n_words=test_df["text"].apply(get_length_in_words))
test_df = test_df.assign(sentiment=test_df["text"].apply(get_sentiment))
test_df = test_df.assign(rel_char_len=test_df["text"].apply(get_relative_length))

### END STARTER CODE

In [20]:
# solution_1_7_1

### YOUR ANSWER HERE
train_df = train_df.assign(
    has_hyperlinks=train_df["text"].str.contains("http://|https://").astype(int)
)
train_df = train_df.assign(
    has_numbers=train_df["text"].str.contains("[0-9]").astype(int)
)

test_df = test_df.assign(
    has_hyperlinks=test_df["text"].str.contains("http://|https://").astype(int)
)
test_df = test_df.assign(
    has_numbers=test_df["text"].str.contains("[0-9]").astype(int)
)

**solution_1_7_1 (reasoning)**

### YOUR ANSWER HERE

- A binary feature `has_hyperlinks` indicating whether the tweet has hyperlink or not.
    - A tweet of a disaster event is likely to have a hyperlink of the news or other related article.
- A binary feature `has_numbers`
    - A disaster event is likely to have number of people affected in a related tweet.

**solution_1_7_2**

### YOUR ANSWER HERE

In this case we will not violate the golden rule if we do the transformation in original `df` as all our transformations are row level transformations. However, it is better to avoid doing the transformation in the original df as we will not be losing our original untouched dataset.

### 1.8 Pipeline with all features
rubric={accuracy:4,reasoning:2}

**Your tasks:**
1. Identify different feature types in your new data set with the features you created above, and separate features and targets from your new dataset. 
2. Define a column transformer for your mixed feature types. Again, set `max_features` of `CountVectorizer` to 20_000.  
3. Define a pipeline with the column transformer and `LogisticRegression` with `class_weight` of `LogisticRegression` set to "balanced" and report mean cross-validation f1 scores.


In [21]:
# solution_1_8_1

### YOUR ANSWER HERE
X_train, y_train = train_df.drop(columns=["target"]), train_df[target]
X_test, y_test = test_df.drop(columns=["target"]), test_df[target]

numeric_features = ["n_words", "rel_char_len"]
categorical_features = ["keyword", "sentiment"]
text_features = "text"
binary_features = ["has_hyperlinks", "has_numbers"]
drop_features = ["location"]


In [22]:
# solution_1_8_2

### YOUR ANSWER HERE
numeric_pipeline = make_pipeline(SimpleImputer(), StandardScaler())
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
)

preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    (SimpleImputer(strategy="most_frequent"), binary_features),
    (categorical_pipeline, categorical_features),
    (CountVectorizer(max_features=20_000), "text")
)

In [23]:
# solution_1_8_3

### YOUR ANSWER HERE

model_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(random_state=2, class_weight="balanced", max_iter=1000),
)

scores = cross_validate(
    model_pipe,
    X_train,
    y_train,
    scoring=["f1", "accuracy"],
    cv=5,
    n_jobs=-1,
    return_train_score=True
)
scores

mean_scores_df = pd.concat(
    [mean_scores_df, summarize_cv_scores(scores, "Logistic Regression (feat eng.)")]
)
mean_scores_df

Unnamed: 0,classifier_name,fit_time,score_time,validation_f1,train_f1,validation_accuracy,train_accuracy
0,Dummy Classifier (stratified),0.001346,0.002688,0.178843,0.183351,0.692064,0.695911
0,Logistic Regression (without keyword),0.722058,0.043308,0.689627,0.973009,0.883686,0.989666
0,Logistic Regression (with keyword),0.982626,0.056809,0.690259,0.973728,0.883686,0.989941
0,Logistic Regression (feat eng.),1.43942,0.059701,0.690929,0.972813,0.883576,0.989583


### 1.9 Interpretation
rubric={accuracy:4,reasoning:2}

1. Do you see any improvements with the new features compared to when you used only `CountVectorizer` features? Note that feature engineering is hard and requires domain expertise. If you do not see big improvements in scores with new features, that's OK. Do not get discouraged. The purpose of this exercise is to make you familiar to the process of extracting new features rather than getting the best scores. 
2. Show the first 20 coefficients with largest magnitudes and corresponding features. 
3. Examine the coefficients of the features we have extracted above. Do they make sense? 

**solution_1_9_1**

### YOUR ANSWER HERE

No, we do not see any significant improvement using the new features. 

In [24]:
# solution_1_9_2

### YOUR ANSWER HERE
model_pipe.fit(X_train, y_train)

cat_columns = (
    pd.Series(
        model_pipe.named_steps["columntransformer"]
        .named_transformers_["pipeline-2"]
        .named_steps["onehotencoder"]
        .get_feature_names()
    )
    .str.replace("x1_", "sentiment_")
    .str.replace("x0_", "keyword_")
    .tolist()
)

txt_columns = (
    model_pipe.named_steps["columntransformer"]
    .named_transformers_["countvectorizer"]
    .get_feature_names()
)

features = numeric_features + binary_features + cat_columns + txt_columns
coefficients = model_pipe.named_steps["logisticregression"].coef_.flatten()

model_interpret_df = pd.DataFrame(
    data={"features": features, "coefficients": coefficients}
)
model_interpret_df = model_interpret_df.assign(
    abs_coef=np.abs(model_interpret_df.coefficients)
)
model_interpret_df.sort_values(by="abs_coef", ascending=False).head(20)[
    ["features", "coefficients"]
]

Unnamed: 0,features,coefficients
17206,thunderstorm,2.065141
16525,survived,2.057184
5892,died,2.012474
3731,carried,1.877849
217,keyword_windstorm,1.847924
14735,scared,1.786221
16981,terrorists,1.732961
49,keyword_collapse,-1.650739
4447,collision,1.631677
14260,road,1.625345


**solution_1_9_3**

### YOUR ANSWER HERE

Some of the features and their corresponding coefficients make sense. For example, 
- we have a positive coefficient for negative sentiment and a negative coefficient for positive sentiment
- `has_numbers` have a positive coefficient which confirms our intuition.

However, the coefficients of engineered features are small indicating that their impact is not very significant in prediction. Hence, addition of these new features is not improving our validation score significantly.

In [25]:

model_interpret_df.loc[model_interpret_df.features.isin(
    [
        "n_words",
        "rel_char_len",
        "has_hyperlinks",
        "has_numbers",
        "sentiment_compound",
        "sentiment_neg",
        "sentiment_neu",
        "sentiment_pos",
    ]
), ["features", "coefficients"]]

Unnamed: 0,features,coefficients
0,n_words,-0.506661
1,rel_char_len,0.396628
2,has_hyperlinks,-0.461376
3,has_numbers,0.264
223,sentiment_compound,-0.553334
224,sentiment_neg,0.290422
225,sentiment_neu,0.345551
226,sentiment_pos,-0.095927


### 1.10 Test results
rubric={accuracy:2, reasoning:2}

**Yout tasks**

1. Report f1 score on the test set with the model trained with all features. 
2. What additional time, other than prediction time, do we need if we are to use this model with our engineered features on the deployment data?  

In [26]:
# solution_1_10_1

### YOUR ANSWER HERE

y_pred = model_pipe.predict(X_test)
print(f"f1-score on test set: {f1_score(y_test, y_pred):.4f}")

f1-score on test set: 0.7339


**solution_1_10_2**

### YOUR ANSWER HERE

In addition to prediction time, we have to create the engineered feature on deployment dataset.