In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from string import punctuation

%matplotlib inline

In [None]:
# Load the airline tweets dataset into a DataFrame.
# The path is relative to the notebook's working directory.
tweets_path = '../data/airline_tweets.csv'
tweets = pd.read_csv(tweets_path, sep=',')

## Challenge 1: Getting to Know the Data

Use `pandas` to find out the following about the airline tweets:

* How many tweets are in the dataset?
* How many tweets are positive, neutral, and negative?
* What *proportion* of tweets are positive, neutral, and negative?
* Make a bar plot showing the proportion of tweet sentiments.

If you have time, try the following:

* How much time separates the earliest and latest tweets?
* What gets more retweets: positive, negative, or neutral tweets?
* Identify the airline whose tweets have the highest proportion of negative sentiment.

In [None]:
# Dataset size: the number of rows in the tweets DataFrame
len(tweets)

In [None]:
# Tally the tweets that fall under each sentiment label
tweets.loc[:, 'airline_sentiment'].value_counts()

In [None]:
# Same tally as a share of all tweets (normalize=True makes the values sum to 1)
sentiment_labels = tweets['airline_sentiment']
sentiment_labels.value_counts(normalize=True)

In [None]:
# Bar plot of the *proportion* of tweets per sentiment.
# The proportions are computed with pandas and drawn with barplot, because a
# plain countplot shows raw counts rather than proportions.
sentiment_order = ['positive', 'neutral', 'negative']
sentiment_props = (
    tweets['airline_sentiment']
    .value_counts(normalize=True)
    .reindex(sentiment_order)
)
ax = sns.barplot(x=sentiment_props.index,
                 y=sentiment_props.values,
                 order=sentiment_order)
# Label the figure so it can be read on its own; the trailing `;` hides the repr
ax.set(xlabel='Sentiment',
       ylabel='Proportion of tweets',
       title='Proportion of tweets by sentiment');

In [None]:
# How much time separates the earliest and latest tweets?
# Parse the timestamps *before* comparing them: ordering the raw strings is
# lexicographic, which only matches chronological order when every value shares
# the same format and UTC offset. utc=True puts all values on one timeline.
tweet_times = pd.to_datetime(tweets['tweet_created'], utc=True)
tweet_times.max() - tweet_times.min()

In [None]:
# Average retweet count within each sentiment group
tweets.groupby('airline_sentiment')['retweet_count'].agg('mean')

In [None]:
# Share of each sentiment within every airline's tweets.
# Numerator: tweets per (airline, sentiment) pair; denominator: tweets per airline.
tweets_per_airline_sentiment = tweets.groupby(['airline', 'airline_sentiment']).size()
tweets_per_airline = tweets.groupby('airline').size()
proportions = tweets_per_airline_sentiment / tweets_per_airline
# One row per airline, one column per sentiment; sorted ascending by the
# negative share, so the airline with the highest share is the last row
proportions.unstack(level='airline_sentiment').sort_values(by='negative')

## Challenge 2: Creating a Preprocessing Pipeline for Social Media Data

Write a function called `preprocess()` that performs the following on a text input:

* Lowercase text.
* Replace all URLs with the token "URL".
* Replace all numbers with the token "DIGIT".
* Replace hashtags with the token "HASHTAG".
* Replace all users with the token "USER".
* Collapse runs of whitespace into a single space, and strip leading and trailing whitespace.

We have provided regex patterns for each of the replacement steps in the following cells.

Run your `preprocess()` function on `example_tweet` (defined in the next cell), and when you think you have it working, apply it to the entire `text` column in the tweets DataFrame.

In [None]:
# Sample tweet for trying out preprocess(): it contains user mentions,
# a number, hashtags, and a URL
example_tweet = (
    "lol @justinbeiber and @BillGates are like soo 2000 "
    "#yesterday #amiright saw it on https://twitter.com #yolo"
)

In [None]:
def preprocess(text):
    """Normalize a tweet by replacing social-media artifacts with placeholders.

    Steps, in order:
      1. Lowercase the text.
      2. Replace each URL with ``URL``.
      3. Replace each hashtag with ``HASHTAG``.
      4. Replace each user mention with ``USER``.
      5. Replace each remaining run of digits with ``DIGIT``.
      6. Collapse runs of whitespace into one space and strip both ends.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The normalized text. Placeholders are uppercase, so they stay distinct
        from the lowercased tweet content.
    """
    # Lowercase
    text = text.lower()
    # Replace URLs. Match only up to the next whitespace: a greedy `.*` would
    # also swallow everything after the URL, e.g. a trailing hashtag.
    url_pattern = r'https?://\S+'
    url_repl = ' URL '
    text = re.sub(url_pattern, url_repl, text)
    # Replace hashtags. This runs before the digit step so that a hashtag
    # containing digits (e.g. "#a380") becomes a single placeholder.
    hashtag_pattern = r'(?:^|\s)[＃#]{1}(\w+)'
    hashtag_repl = ' HASHTAG '
    text = re.sub(hashtag_pattern, hashtag_repl, text)
    # Replace users. Also before the digit step, for the same reason
    # (e.g. "@user123").
    user_pattern = r'@(\w+)'
    user_repl = ' USER '
    text = re.sub(user_pattern, user_repl, text)
    # Replace the digits that remain. Raw string: '\d' in a normal string
    # literal is an invalid escape sequence.
    digit_pattern = r'\d+'
    digit_repl = ' DIGIT '
    text = re.sub(digit_pattern, digit_repl, text)
    # Collapse whitespace and trim the ends
    blankspace_pattern = r'\s+'
    blankspace_repl = ' '
    text = re.sub(blankspace_pattern, blankspace_repl, text).strip()
    return text

In [None]:
# Sanity check: run the pipeline on the sample tweet and inspect the result
preprocess(text=example_tweet)

In [None]:
# Run preprocess() on every tweet and store the result in a new column.
# The function is passed directly; no lambda wrapper is needed.
tweets['text_processed'] = tweets['text'].apply(preprocess)
# Preview the first five processed tweets
tweets['text_processed'].head(5)

## Challenge 3: DTM Data Analysis

* Print out the most infrequent words rather than the most frequent words. If you're not sure how, check the [documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html)!
* Print the average number of times each word is used in a tweet.
* Which non-hashtag, non-digit token appears the most in any given tweet? How many times does it appear? What is the original tweet?

In [None]:
# Learn the vocabulary and count token occurrences in each processed tweet
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(tweets['text_processed'])
# Vocabulary terms, used as the column labels of the document-term matrix
tokens = vectorizer.get_feature_names_out()
# Dense document-term matrix: one row per tweet, one column per token
dtm = pd.DataFrame(counts.toarray(), columns=tokens, index=tweets.index)

In [None]:
# Total occurrences of each token across all tweets, rarest first
# (sort_values sorts in ascending order by default)
dtm.sum(axis=0).sort_values().head(20)

In [None]:
# Mean count of each token per tweet, showing the 20 largest
dtm.mean(axis=0).sort_values(ascending=False).iloc[:20]

In [None]:
# Which non-placeholder token appears the most in any given tweet?
# Drop the placeholder columns *before* taking the row-wise maximum. Filtering
# afterwards would throw away every tweet whose most frequent token is a
# placeholder, instead of reporting that tweet's most frequent real word.
# The result gets its own name rather than rebinding `counts`, which holds the
# sparse count matrix.
placeholder_tokens = ['digit', 'hashtag', 'user', 'url']
# errors='ignore': a placeholder may be absent from the vocabulary
word_dtm = dtm.drop(columns=placeholder_tokens, errors='ignore')
top_tokens = pd.DataFrame({
    'token': word_dtm.idxmax(axis=1),   # most frequent word in each tweet
    'number': word_dtm.max(axis=1),     # how often that word occurs
})
top_tokens.sort_values('number', ascending=False).head(10)

In [None]:
# Original text of the tweet at position 1214 (top token: "worst")
tweets['text'].iloc[1214]

In [None]:
# Original text of the tweet at position 3915 (top token: "lt")
tweets['text'].iloc[3915]

## Challenge 4: Customizing the Vectorizer with `nltk` inputs

If you look at the `CountVectorizer` documentation, you'll see that it can actually accept a custom `tokenizer` and `stop_words` list. 

Using what you learned in the previous workshop, create a `CountVectorizer` that utilizes the `nltk` word tokenizer and stop word list. How does the resulting DTM look different?

In [None]:
# NLTK's English stop word list
stop_words = stopwords.words('english')
# CountVectorizer that tokenizes with NLTK's word_tokenize; min_df=2 and
# max_df=0.95 are passed through unchanged
vectorizer = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stop_words,
    lowercase=True,
    max_df=0.95,
    min_df=2)
# Count tokens per tweet and recover the vocabulary
counts = vectorizer.fit_transform(tweets['text_processed'])
tokens = vectorizer.get_feature_names_out()
# Dense document-term matrix: one row per tweet, one column per token
dtm = pd.DataFrame(counts.toarray(), columns=tokens, index=tweets.index)
print(dtm.shape)
dtm.head()

## Challenge 5

Try developing a **multinomial logistic regression** model to predict positive, negative, and neutral labels. We've provided you a fitter function below, but it's up to you to create the labels and train-test splits, and to perform the fitting and evaluation!

In [None]:
def fit_multinomial_logistic_regression(X, y, random_state=42):
    """Fit an L1-penalized multinomial logistic regression with built-in CV.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``LogisticRegressionCV.fit``.
    y : array-like
        Class labels passed to ``LogisticRegressionCV.fit``.
    random_state : int or None, default 42
        Seed forwarded to the estimator so repeated runs give the same model.
        Pass ``None`` to leave the solver unseeded.

    Returns
    -------
    LogisticRegressionCV
        The fitted model.
    """
    # NOTE(review): newer scikit-learn releases deprecate the `multi_class`
    # argument -- confirm against the installed version.
    model = LogisticRegressionCV(
        multi_class='multinomial',
        Cs=10,
        penalty='l1',
        solver='saga',
        # Loose tolerance and few iterations keep the fit fast
        tol=1e-2,
        max_iter=50,
        cv=3,
        refit=True,
        random_state=random_state).fit(X, y)
    return model

In [None]:
# TF-IDF features, capped at 5000 vocabulary terms.
# NOTE: the vectorizer is fit on every tweet, i.e. before any train/test split.
vectorizer = TfidfVectorizer(max_features=5000)
dtm = vectorizer.fit_transform(tweets['text_processed'])
# Dense feature array and the sentiment labels to predict
X = dtm.toarray()
y = tweets['airline_sentiment']
print(X.shape)

In [None]:
# Hold out 20% of the tweets for testing.
# random_state makes the split reproducible across Restart & Run All;
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Fit on the training split only (this can take a while to run)
model = fit_multinomial_logistic_regression(X=X_train, y=y_train)

In [None]:
# Mean accuracy on the data the model was fit on vs. the held-out data
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(f"Training accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")

## Challenge 6

Create a new fitter function that uses a `RandomForestClassifier`. How is the performance? Check the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) for more details.

In [None]:
def fit_random_forest(X, y, n_estimators=50, random_state=42):
    """Fit a random forest classifier to the provided data.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``RandomForestClassifier.fit``.
    y : array-like
        Class labels passed to ``RandomForestClassifier.fit``.
    n_estimators : int, default 50
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed forwarded to the estimator so repeated runs give the same model.
        Pass ``None`` to leave the forest unseeded.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state)
    return model.fit(X, y)

In [None]:
# Binary task: keep only the tweets that are not labelled neutral
tweets_binary = tweets[tweets['airline_sentiment'] != 'neutral']
# TF-IDF features, capped at 5000 vocabulary terms
vectorizer = TfidfVectorizer(max_features=5000)
dtm = vectorizer.fit_transform(tweets_binary['text_processed'])
X = dtm.toarray()
y = tweets_binary['airline_sentiment']
# Hold out 20% for testing. random_state makes the split reproducible;
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Fit the random forest on the training split only
model = fit_random_forest(X=X_train, y=y_train)

In [None]:
# Compare accuracy on the training and held-out splits; a training score well
# above the test score indicates overfitting
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(f"Training accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")