In [2]:
import pandas as pd
import base64
import tweepy as tw
import re
import numpy as np
import string
import unicodedata
import nltk
import gensim
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
import os
import matplotlib.pyplot as plt
import gensim.corpora as corpora
import streamlit as st
from pprint import pprint
from nltk.util import bigrams
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from streamlit_metrics import metric, metric_row
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download('stopwords')

  def _figure_formats_changed(self, name, old, new):
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\domen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Step 1: Set up Twitter API access
Set up the project here: https://developer.twitter.com/en/portal/projects-and-apps

Using this site as reference: https://www.earthdatascience.org/courses/use-data-open-source-python/intro-to-apis/twitter-data-in-python/

In [3]:
# English stopwords
stopwords_en = nltk.corpus.stopwords.words('english')

# French stopwords
stopwords_fr = nltk.corpus.stopwords.words('french')

# Function 1
#-----------------
def get_table_download_link(df):
    # Reference: https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
    """
    Generates a link allowing the data in a given panda dataframe to be downloaded
    in:  dataframe
    out: href string
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
    href = f'<a href="data:file/csv;base64,{b64}" download="tweets.csv">Download CSV file</a>'
    return href

  and should_run_async(code)


In [80]:
import yaml


# Function 2: 
#----------------
# Hit twitter api & add basic features & output 2 dataframes
# @st.cache(suppress_st_warning=True,allow_output_mutation=True)
def twitter_get(select_hashtag_keyword, select_language, user_word_entry, num_of_tweets):  
    
    # Set up Twitter API access
    # Define access keys and tokens
    #consumer_key = st.secrets['consumer_key']
    #consumer_secret = st.secrets['consumer_secret']
    #access_token = st.secrets['access_token']
    #access_token_secret = st.secrets['access_token_secret']
    
    # Reference: https://gist.github.com/radcliff/47af9f6238c95f6ae239
    # Load yml file to dictionary :)
    credentials = yaml.load(open('./credentials.yml'), Loader=yaml.FullLoader)

    # Define access keys and tokens
    consumer_key = credentials['dom_twitter_api']['consumer_key']
    consumer_secret = credentials['dom_twitter_api']['consumer_secret']
    access_token = credentials['dom_twitter_api']['access_token']
    access_token_secret = credentials['dom_twitter_api']['access_token_secret']

    auth = tw.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tw.API(auth, wait_on_rate_limit = True)
    
    # Keyword or hashtag
    if select_hashtag_keyword == 'Hashtag':
        user_word = '#' + user_word_entry
    else:
        user_word = user_word_entry

    # Retweets (assumes yes)
    user_word = user_word + ' -filter:retweets'
    # The following is based on user language selection

    # ...English Language
    if select_language == 'English':
        language = 'en'

    # ...French Language
    if select_language == 'French':
        language = 'fr'

    # Retweets (assumes yes)
    user_word = user_word + ' -filter:retweets'

    # Scenario 1: All languages
    if select_language == 'All':
        tweets = tw.Cursor(api.search,
                            q=user_word,
                            tweet_mode = "extended").items(num_of_tweets)

    # Scenario 2: Specific language (English or French)
    if select_language != 'All':
        tweets = tw.Cursor(api.search,
                            q=user_word,
                            tweet_mode = "extended",
                            lang=language).items(num_of_tweets)

    # Store as dataframe
    tweet_metadata = [[tweet.created_at, tweet.id, tweet.full_text, tweet.user.screen_name, tweet.retweet_count, tweet.favorite_count] for tweet in tweets]
    df_tweets = pd.DataFrame(data=tweet_metadata, columns=['created_at', 'id', 'full_text', 'user', 'rt_count', 'fav_count'])

    # Add a new data variable
    df_tweets['created_dt'] = df_tweets['created_at'].dt.date

    # Add a new time variable
    df_tweets['created_time'] = df_tweets['created_at'].dt.time

    # Create a new text variable to do manipulations on 
    df_tweets['clean_text'] = df_tweets.full_text


    df_new = df_tweets[["created_dt", "created_time", "full_text", "user", "rt_count", "fav_count"]]
    df_new = df_new.rename(columns = {"created_dt": "Date", 
                                 "created_time": "Time", 
                                  "full_text": "Tweet", 
                                  "user": "Username", 
                                  "rt_count": "Retweets",  
                                  "fav_count": "Favourites"})

    return df_tweets, df_new

# Function 2: 
#----------------
# takes in pandas dataframe after first twitter scrape
# returns a pandas dataframe that has classified each tweet as relating to an nhl team

def classify_nhl_team(df):
    
    # Create a new and smaller dataframe to work with called df
    df = df[["id", "user", "created_at", "full_text", "clean_text"]]
    # Convert tweet to lower
    df.clean_text = df.clean_text.str.lower()  
    # Classification: If a team's keywords come up, classify as a team specific indicator, with value = team name
    df['ANA'] = pd.np.where(df['clean_text'].str.contains('anaheim|ducks|#flytogether'), 'Anaheim Ducks', '0')
    df['ARZ'] = pd.np.where(df['clean_text'].str.contains('arizona|coyotes|#yotes'), 'Arizona Coyotes', '0')
    df['BOS'] = pd.np.where(df['clean_text'].str.contains('boston|bruins|#nhlbruins'), 'Boston Bruins', '0')
    df['BUF'] = pd.np.where(df['clean_text'].str.contains('buffalo|sabres|#letsgobuffalo'), 'Buffalo Sabres', '0')
    df['CGY'] = pd.np.where(df['clean_text'].str.contains('calgary|flames|#cofred'), 'Calgary Flames', '0')
    df['CAR'] = pd.np.where(df['clean_text'].str.contains('carolina|hurricanes|#canes|#letsgocanes'), 'Carolina Hurricanes', '0')
    df['CHI'] = pd.np.where(df['clean_text'].str.contains('chicago|blackhawks|#blackhawks'), 'Chicago Blackhawks', '0')
    df['COL'] = pd.np.where(df['clean_text'].str.contains('colorado|avalanche|#GoAvsGo'), 'Colorado Avalanche', '0')
    df['CBJ'] = pd.np.where(df['clean_text'].str.contains('columbus|bluejackets|jackets|#CBJ'), 'Columbus Blue Jackets', '0')
    df['DAL'] = pd.np.where(df['clean_text'].str.contains('dallas|stars|#gostars'), 'Dallas Stars', '0')
    df['DET'] = pd.np.where(df['clean_text'].str.contains('detroit|redwings|#lgrw'), 'Detroit Red Wings', '0')
    df['EDM'] = pd.np.where(df['clean_text'].str.contains('edmonton|oilers|#oilers'), 'Edmonton Oilers', '0')
    df['FLA'] = pd.np.where(df['clean_text'].str.contains('florida|panthers|#flapanthers'), 'Florida Panthers', '0')
    df['LAK'] = pd.np.where(df['clean_text'].str.contains('los angeles|kings|#gokingsgo'), 'Los Angeles Kings', '0')
    df['MIN'] = pd.np.where(df['clean_text'].str.contains('minnesota|wild|#mnwild'), 'Minnesota Wild', '0')
    df['MTL'] = pd.np.where(df['clean_text'].str.contains('montreal|canadiens|habs|#gohabsgo'), 'Montreal Canadiens', '0')
    df['NSH'] = pd.np.where(df['clean_text'].str.contains('nashville|predators|#preds'), 'Nashville Predators', '0')
    df['NJD'] = pd.np.where(df['clean_text'].str.contains('new jersey|devils|#njdevils'), 'New Jersey Devils', '0')
    df['NYI'] = pd.np.where(df['clean_text'].str.contains('new york islanders|islanders|#isles'), 'New York Islanders', '0')
    df['NYR'] = pd.np.where(df['clean_text'].str.contains('new york rangers|rangers|#nyr'), 'New York Rangers', '0')
    df['OTT'] = pd.np.where(df['clean_text'].str.contains('ottawa|senators|sens|#gosensgo'), 'Ottawa Senators', '0')
    df['PHI'] = pd.np.where(df['clean_text'].str.contains('philadelphia|flyers|#anytimeanywhere'), 'Philadelphia Flyers', '0')
    df['PIT'] = pd.np.where(df['clean_text'].str.contains('pittsburgh|penguins|#pens|#letsgopens'), 'Pittsburgh Penguins', '0')
    df['SJS'] = pd.np.where(df['clean_text'].str.contains('san jose|sharks|#sjsharks'), 'San Jose Sharks', '0')
    df['SEA'] = pd.np.where(df['clean_text'].str.contains('seattle|kraken|#seakraken'), 'Seattle Kraken', '0')
    df['STL'] = pd.np.where(df['clean_text'].str.contains('stlouis|st. louis|st louis|blues|#stblues'), 'St Louis Blues', '0')
    df['TBL'] = pd.np.where(df['clean_text'].str.contains('tampa bay|lightning|tampa|#gobolts'), 'Tampa Bay Lightning', '0')
    df['TOR'] = pd.np.where(df['clean_text'].str.contains('toronto|maple leafs|#leafsforever'), 'Toronto Maple Leafs', '0')
    df['VAN'] = pd.np.where(df['clean_text'].str.contains('vancouver|canucks|#canucks'), 'Vancouver Canucks', '0') 
    df['VGK'] = pd.np.where(df['clean_text'].str.contains('vegas|golden knights|knights|#vegasborn'), 'Vegas Golden Knights', '0')
    df['WSH'] = pd.np.where(df['clean_text'].str.contains('washington|capitals|#caps|#allcaps'), 'Washington Capitals', '0')
    df['WPG'] = pd.np.where(df['clean_text'].str.contains('winnipeg|jets|#gojetsgo'), 'Winnipeg Jets', '0')

    # Define columns to concatenate
    cols = ['ANA', 'ARZ', 'BOS', 'BUF', 'CGY', 'CAR', 'CHI', 'COL', 'CBJ', 'DAL', 'DET', 'EDM', 'FLA', 'LAK', 'MIN', 'MTL', 'NSH', 'NJD', 'NYI', 'NYR', 'OTT', 'PHI', 'PIT', 'SJS', 'SEA', 'STL', 'TBL', 'TOR', 'VAN', 'VGK', 'WSH', 'WPG']
    # Concatenate columns
    df['teams_concat'] = df[cols].apply(lambda x: ','.join(x), axis=1)
    # Replace 0s with nothing
    df['teams_concat'] = df.teams_concat.str.replace('0,|0|,0','').str.strip()
    # ind variable - if multiple commas exist (proxy for num of teams), then 1 else 0
    df['multiple_teams'] = np.where(df.teams_concat.str.contains(","), 1, 0)
    # ind variable - if the length of teams_concat is equal to 0 (proxy for no teams matched), then 1 else 0
    df['no_matches'] = np.where(df.teams_concat.str.len() == 0, 1, 0)

    # Stash a dataframe with those tweets that were never paired to a keyword 
    df_nomatch = df.loc[df['no_matches'] == 1]
    # Select columns
    df_nomatch = df_nomatch[['id', 'user', 'created_at', 'full_text', 'clean_text', 'multiple_teams', 'no_matches', 'teams_concat']]

    # Melt the dataframe such that each row is equal to a tweet that was matched to a team's keyword (introducing dups to tweets)
    melted_df = df.melt(
                    id_vars = ['id', 'user', 'created_at', 'full_text', 'clean_text', 'multiple_teams', 'no_matches', 'teams_concat'],
                    value_vars = ['ANA', 'ARZ', 'BOS', 'BUF', 'CGY', 'CAR', 'CHI', 'COL', 'CBJ', 'DAL', 'DET', 'EDM', 'FLA', 'LAK', 'MIN', 'MTL', 'NSH', 'NJD', 'NYI', 'NYR', 'OTT', 'PHI', 'PIT', 'SJS', 'SEA', 'STL', 'TBL', 'TOR', 'VAN', 'VGK', 'WSH', 'WPG'],
                    var_name = 'nhl_team_abbr',
                    value_name = "nhl_team"
                    )
    # Filter out 0s 
    melted_df = melted_df.loc[melted_df['nhl_team'] != '0']

    # Add feature parity to df_nomatch
    df_nomatch['nhl_team_abbr'] = 'Unknown' 
    df_nomatch['nhl_team'] = 'Unknown' 

    # Append df_nomatch to melted_df to get df_clean
    df_clean = melted_df.append(df_nomatch)

    # Show a few rows of data
    df_clean.head(5)
    #print('total rows:', len(df_clean),'melted_rows:', (len(melted_df)), 'nomatch_rows:', len(df_nomatch))

    # Extend df_clean by joining in data about the team
    df_merged = pd.merge(df_clean,
                        teams,
                        on = 'nhl_team',
                        how = 'left',
                        indicator = True)

    return df_merged

#classify_nhl_team(df_tweets).head(5)



  and should_run_async(code)


In [81]:
#Read in teams & accounts CSVs
teams = pd.read_csv('nhl_app_teams.csv')
accounts = pd.read_csv('nhl_app_accounts.csv')

# if team is kraken, then kraken else rest of league
#teams['expansion_type'] = np.where(teams.nhl_team.str.contains("Kraken"), "Kraken", "Rest of League")

teams.head(3)

  and should_run_async(code)


Unnamed: 0,twitter_handle,nhl_team,conference,2022_division,2021_division,expansion_type,2021_hashtag,other_hashtag,image,primary_hex,secondary_hex,tertiary_hex
0,@AnaheimDucks,Anaheim Ducks,Westernern,Pacific,West,Rest of League,#FlyTogether,#ducks,http://www.capsinfo.com/images/NHL_Team_Logos/...,#F47A38,#B9975B,#C1C6C8
1,@ArizonCoyotes,Arizona Coyotes,Western,Central,West,Rest of League,#Yotes,#coyotes,http://www.capsinfo.com/images/NHL_Team_Logos/...,#8C2633,#E2D6B5,#111111
2,@NHLBruins,Boston Bruins,Eastern,Atlantic,East,Rest of League,#NHLBruins,#bruins,http://www.capsinfo.com/images/NHL_Team_Logos/...,#FFB81C,#000000,


# Step 2: Get tweets and transform into dataframe

In [82]:
select_hashtag_keyword = "Keyword"
user_word_entry = "NHL Expansion Draft"
select_language = "English"
num_of_tweets = 1000

# Run function 2: Get twitter data 
df_tweets, df_new = twitter_get(select_hashtag_keyword, select_language, user_word_entry, num_of_tweets)

# Run function 3: Get classified nhl teams data
df_nhl = classify_nhl_team(df_tweets)

df_nhl.head(5)

  and should_run_async(code)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
  df['ANA'] = pd.np.where(df['clean_text'].str.contains('anaheim|ducks|#flytogether'), 'Anaheim Ducks', '0')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ANA'] = pd.np.where(df['clean_text'].str.contains('anaheim|ducks|#flytogether'), 'Anaheim Ducks', '0')
  df['ARZ'] = pd.np.where(df['clean_text'].str.contains('arizona|coyotes|#yotes'), 'Arizona Coyotes', '0')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

Unnamed: 0,id,user,created_at,full_text,clean_text,multiple_teams,no_matches,teams_concat,nhl_team_abbr,nhl_team,...,2022_division,2021_division,expansion_type,2021_hashtag,other_hashtag,image,primary_hex,secondary_hex,tertiary_hex,_merge
0,1413869071695814658,CBJcoverage,2021-07-10 14:33:49,Friedman: Jack Eichel trade now expected after...,friedman: jack eichel trade now expected after...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",ANA,Anaheim Ducks,...,Pacific,West,Rest of League,#FlyTogether,#ducks,http://www.capsinfo.com/images/NHL_Team_Logos/...,#F47A38,#B9975B,#C1C6C8,both
1,1413732444151422978,NHL_Watcher,2021-07-10 05:30:55,Friedman in 33 Thoughts writes a Jack Eichel t...,friedman in 33 thoughts writes a jack eichel t...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",ANA,Anaheim Ducks,...,Pacific,West,Rest of League,#FlyTogether,#ducks,http://www.capsinfo.com/images/NHL_Team_Logos/...,#F47A38,#B9975B,#C1C6C8,both
2,1413974338173292550,hockeyaddicts,2021-07-10 21:32:07,"NHL Rumors: Oilers, Blackhawks, Flyers, Sabres...","nhl rumors: oilers, blackhawks, flyers, sabres...",1,0,"Arizona Coyotes,Buffalo Sabres,Carolina Hurric...",ARZ,Arizona Coyotes,...,Central,West,Rest of League,#Yotes,#coyotes,http://www.capsinfo.com/images/NHL_Team_Logos/...,#8C2633,#E2D6B5,#111111,both
3,1413270173352562690,ClubsCrimson,2021-07-08 22:54:01,NHL Draft Order is set. Buffalo Sabres pick 1s...,nhl draft order is set. buffalo sabres pick 1s...,1,0,"Arizona Coyotes,Buffalo Sabres,Seattle Kraken",ARZ,Arizona Coyotes,...,Central,West,Rest of League,#Yotes,#coyotes,http://www.capsinfo.com/images/NHL_Team_Logos/...,#8C2633,#E2D6B5,#111111,both
4,1412846290698969092,KrakenChronicle,2021-07-07 18:49:39,The next look at who the #SeaKraken could take...,the next look at who the #seakraken could take...,1,0,"Arizona Coyotes,Seattle Kraken",ARZ,Arizona Coyotes,...,Central,West,Rest of League,#Yotes,#coyotes,http://www.capsinfo.com/images/NHL_Team_Logos/...,#8C2633,#E2D6B5,#111111,both


# Step 3: Basic exploratory data analysis (EDA)

In [7]:
print('Rows:', df_tweets_raw.shape[0], '\nColumns:', df_tweets_raw.shape[1])

  and should_run_async(code)


NameError: name 'df_tweets_raw' is not defined

In [None]:
max_tweet_length = df_tweets_raw.full_text.apply(len).max()
print('Longest tweet is', max_tweet_length, 'characters long')

In [None]:
min_date = df_tweets_raw.created_at.min()
max_date = df_tweets_raw.created_at.max()

print('Min. date: ', min_date, '\nMax. date: ', max_date)

In [None]:
df_tweets_raw.describe(include='all')

# Step 4: Feature Extraction (before text cleaning)
* Count of Stopwords
* Count of @ characters
* Count of Hashtag characters
* Count of Numeric characters
* Count of Punctuation
* Count of Emojis 😜
* Count of Emoticons :-)

In [None]:
# Load libraries
#!pip install -q wordcloud
#import wordcloud
from nltk.corpus import stopwords
import nltk
import string
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop = stopwords.words('english')

In [25]:
# Function 4
#-----------------
def feature_extract(df):
    #TODO: add emoticons and emojis to this! and other punctuation

    # Create pre-clean character count feature
    df['character_ct'] = df.full_text.apply(lambda x: len(x))
    # Create stopword count features (english and french)
    df['stopword_en_ct'] = df.full_text.apply(lambda x: len([x for x in x.split() if x in stopwords_en]))
    df['stopword_fr_ct'] = df.full_text.apply(lambda x: len([x for x in x.split() if x in stopwords_fr]))
    # Create hashtag count feature
    df['hashtag_ct'] = df.full_text.apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
    # Create link count feature
    df['link_ct'] = df.full_text.apply(lambda x: len([x for x in x.split() if x.startswith('https')]))
    # Create @ sign count feature
    df['atsign_ct'] = df.full_text.apply(lambda x: len([x for x in x.split() if x.startswith('@')]))
    # Create numeric count feature
    df['numeric_ct'] = df.full_text.apply(lambda x: len([x for x in x.split() if x.isdigit()]))
    # Create an uppercase count feature
    df['uppercase_ct'] = df.full_text.apply(lambda x: len([x for x in x.split() if x.isupper()]))
    return df

# Function 5a
#-------------
def round1_text_clean(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) # remove emoji
    text = ' ' + text # added space because there was some weirdness for first word (strip later)
    text = text.lower() # convert all text to lowercase
    text = re.sub(r'(\s)@\w+', '', text) # remove whole word if starts with @
    text = re.sub(r'(\s)\w*\d\w*\w+', '', text) # remove whole word if starts with number
    text = re.sub(r'https\:\/\/t\.co\/*\w*', '', text) # remove https links
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # removes punctuation
    text = re.sub('\[.*?\]', '', text) # removes text in square brackets
    text = text.replace("’s", '') # replace apostrophes with empty string
    text = text.replace("'s", '') # replace apostrophes with empty string
    #text = re.sub('\w*\d\w*', '', text) # remove whole word if starts with number
    #text = re.sub(r'(\s)#\w+', '', text) # remove whole word if starts with #
    text = text.strip() # strip text
    return text

# Function 5b
#-------------
text_clean_round1 = lambda x: round1_text_clean(x)

# Function 6
#-------------
def text_clean_round2(text):
    """
    A simple function to clean up the data. All the words that
    are not designated as a stop word is then lemmatized after
    encoding and basic regex parsing are performed.
    """
    nltk.download('wordnet')
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')
    text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore'))
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

# Function 7
#-------------
def text_clean_round3(text):
    #TODO: add emoticons and emojis to this!
    # Load in stopwords
    stopwords_en = nltk.corpus.stopwords.words('english')
    stopwords_fr = nltk.corpus.stopwords.words('french')
    stopwords = stopwords_en + stopwords_fr
    # Create pre-clean character count feature
    text = text.apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))
    return text

  and should_run_async(code)
  text = re.sub('\[.*?\]', '', text) # removes text in square brackets


In [28]:
# Run function #4: Feature extraction
df_nhl = feature_extract(df_nhl)

# Run function #5: Round 1 text cleaning (convert to lower, remove numbers, @, punctuation, numbers. etc.)
df_nhl['clean_text'] = df_nhl.clean_text.apply(text_clean_round1)

## Run function #7: Round 3 text cleaning (remove stop words)
df_nhl.clean_text  = text_clean_round3(df_nhl.clean_text)

df_nhl.head(5)

  and should_run_async(code)


Unnamed: 0,id,user,created_at,full_text,clean_text,multiple_teams,no_matches,teams_concat,nhl_team_abbr,nhl_team,...,tertiary_hex,_merge,character_ct,stopword_en_ct,stopword_fr_ct,hashtag_ct,link_ct,atsign_ct,numeric_ct,uppercase_ct
0,1413869071695814658,CBJcoverage,2021-07-10 14:33:49,Friedman: Jack Eichel trade now expected after...,friedman jack eichel trade expected expansion ...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",ANA,Anaheim Ducks,...,#C1C6C8,both,210,8,1,1,1,0,0,1
1,1413732444151422978,NHL_Watcher,2021-07-10 05:30:55,Friedman in 33 Thoughts writes a Jack Eichel t...,friedman thoughts writes jack eichel trade lik...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",ANA,Anaheim Ducks,...,#C1C6C8,both,224,13,1,0,0,0,1,1
2,1411070238150434818,mayorNHL,2021-07-02 21:12:15,(new post) NHL RADIO REPLAY: Mayor’s Minutes\n...,new post nhl radio replay mayor minutes topics...,1,0,"Anaheim Ducks,Los Angeles Kings,Seattle Kraken",ANA,Anaheim Ducks,...,#C1C6C8,both,281,1,0,0,2,0,0,7
3,1413974338173292550,hockeyaddicts,2021-07-10 21:32:07,"NHL Rumors: Oilers, Blackhawks, Flyers, Sabres...",nhl rumors oilers blackhawks flyers sabres hur...,1,0,"Arizona Coyotes,Buffalo Sabres,Carolina Hurric...",ARZ,Arizona Coyotes,...,#111111,both,269,6,0,0,2,0,0,2
4,1413270173352562690,ClubsCrimson,2021-07-08 22:54:01,NHL Draft Order is set. Buffalo Sabres pick 1s...,nhl draft order set buffalo sabres pick expans...,1,0,"Arizona Coyotes,Buffalo Sabres,Seattle Kraken",ARZ,Arizona Coyotes,...,#111111,both,250,10,1,0,1,1,0,2


## Part 5: Sentiment Analysis
* Import VADER functions
* Apply VADER sentiment analyzer to raw tweets, create output columns

In [84]:
# Create a copy to preserve the raw data
df_nhl_copy = df_nhl.copy()

df_nhl_copy.head()

  and should_run_async(code)


Unnamed: 0,id,user,created_at,full_text,clean_text,multiple_teams,no_matches,teams_concat,nhl_team_abbr,nhl_team,...,2022_division,2021_division,expansion_type,2021_hashtag,other_hashtag,image,primary_hex,secondary_hex,tertiary_hex,_merge
0,1413869071695814658,CBJcoverage,2021-07-10 14:33:49,Friedman: Jack Eichel trade now expected after...,friedman: jack eichel trade now expected after...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",ANA,Anaheim Ducks,...,Pacific,West,Rest of League,#FlyTogether,#ducks,http://www.capsinfo.com/images/NHL_Team_Logos/...,#F47A38,#B9975B,#C1C6C8,both
1,1413732444151422978,NHL_Watcher,2021-07-10 05:30:55,Friedman in 33 Thoughts writes a Jack Eichel t...,friedman in 33 thoughts writes a jack eichel t...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",ANA,Anaheim Ducks,...,Pacific,West,Rest of League,#FlyTogether,#ducks,http://www.capsinfo.com/images/NHL_Team_Logos/...,#F47A38,#B9975B,#C1C6C8,both
2,1413974338173292550,hockeyaddicts,2021-07-10 21:32:07,"NHL Rumors: Oilers, Blackhawks, Flyers, Sabres...","nhl rumors: oilers, blackhawks, flyers, sabres...",1,0,"Arizona Coyotes,Buffalo Sabres,Carolina Hurric...",ARZ,Arizona Coyotes,...,Central,West,Rest of League,#Yotes,#coyotes,http://www.capsinfo.com/images/NHL_Team_Logos/...,#8C2633,#E2D6B5,#111111,both
3,1413270173352562690,ClubsCrimson,2021-07-08 22:54:01,NHL Draft Order is set. Buffalo Sabres pick 1s...,nhl draft order is set. buffalo sabres pick 1s...,1,0,"Arizona Coyotes,Buffalo Sabres,Seattle Kraken",ARZ,Arizona Coyotes,...,Central,West,Rest of League,#Yotes,#coyotes,http://www.capsinfo.com/images/NHL_Team_Logos/...,#8C2633,#E2D6B5,#111111,both
4,1412846290698969092,KrakenChronicle,2021-07-07 18:49:39,The next look at who the #SeaKraken could take...,the next look at who the #seakraken could take...,1,0,"Arizona Coyotes,Seattle Kraken",ARZ,Arizona Coyotes,...,Central,West,Rest of League,#Yotes,#coyotes,http://www.capsinfo.com/images/NHL_Team_Logos/...,#8C2633,#E2D6B5,#111111,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1413732444151422978,NHL_Watcher,2021-07-10 05:30:55,Friedman in 33 Thoughts writes a Jack Eichel t...,friedman in 33 thoughts writes a jack eichel t...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",LAK,Los Angeles Kings,...,Pacific,West,Rest of League,#GoKingsGo,#kings,http://www.capsinfo.com/images/NHL_Team_Logos/...,#111111,#A2AAAD,#572A84,both
96,1414305376984522762,muggsy34,2021-07-11 19:27:33,@RussoHockey I just noticed how “Minnesota” th...,@russohockey i just noticed how “minnesota” th...,0,0,Minnesota Wild,MIN,Minnesota Wild,...,Central,West,Rest of League,#MNWild,#wild,http://www.capsinfo.com/images/NHL_Team_Logos/...,#154734,#A6192E,#EAAA00,both
97,1414258128963751937,pinejournal,2021-07-11 16:19:48,A look at who Wild could lose in NHL expansion...,a look at who wild could lose in nhl expansion...,0,0,Minnesota Wild,MIN,Minnesota Wild,...,Central,West,Rest of League,#MNWild,#wild,http://www.capsinfo.com/images/NHL_Team_Logos/...,#154734,#A6192E,#EAAA00,both
98,1414257689513840642,CreaseAndAssist,2021-07-11 16:18:03,#mnwild I wonder when Wild fans will stop tryi...,#mnwild i wonder when wild fans will stop tryi...,0,0,Minnesota Wild,MIN,Minnesota Wild,...,Central,West,Rest of League,#MNWild,#wild,http://www.capsinfo.com/images/NHL_Team_Logos/...,#154734,#A6192E,#EAAA00,both


In [30]:
# Credit: https://jackmckew.dev/sentiment-analysis-text-cleaning-in-python-with-vader.html

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sid_analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text:str, analyser,desired_type:str='pos'):
    # Get sentiment from text
    sentiment_score = analyser.polarity_scores(text)
    return sentiment_score[desired_type]

# Get Sentiment scores
def get_sentiment_scores(df, data_column):
    df[f'positive_score'] = df[data_column].astype(str).apply(lambda x: get_sentiment(x,sid_analyzer,'pos'))
    df[f'negative_score'] = df[data_column].astype(str).apply(lambda x: get_sentiment(x,sid_analyzer,'neg'))
    df[f'neutral_score'] = df[data_column].astype(str).apply(lambda x: get_sentiment(x,sid_analyzer,'neu'))
    df[f'compound_score'] = df[data_column].astype(str).apply(lambda x: get_sentiment(x,sid_analyzer,'compound'))
    return df


text_sentiment = get_sentiment_scores(df_nhl, 'full_text')

display(text_sentiment.sample(5))

  and should_run_async(code)


Unnamed: 0,id,user,created_at,full_text,clean_text,multiple_teams,no_matches,teams_concat,nhl_team_abbr,nhl_team,...,stopword_fr_ct,hashtag_ct,link_ct,atsign_ct,numeric_ct,uppercase_ct,positive_score,negative_score,neutral_score,compound_score
672,1413192587364737025,4EverBlueshirts,2021-07-08 17:45:43,#NHL what if...\n\nWhat if Jack Eichel has alr...,nhl jack eichel already traded deal agreed fin...,0,1,,Unknown,Unknown,...,0,1,0,0,0,1,0.042,0.0,0.958,0.1406
770,1411138243328946183,xHalfy,2021-07-03 01:42:29,Honestly I just want this nhl season to be ove...,honestly want nhl season get free agency crazy...,0,1,,Unknown,Unknown,...,0,0,0,0,0,1,0.245,0.077,0.677,0.6369
469,1411435370324058118,EveryBrokenWave,2021-07-03 21:23:10,Kraken troll other NHL teams with expansion dr...,kraken troll nhl teams expansion draft looming...,0,0,Seattle Kraken,SEA,Seattle Kraken,...,0,0,1,1,0,1,0.0,0.12,0.88,-0.128
429,1412607473668739078,SeaTimesSports,2021-07-07 03:00:41,It seemed for months as if the Kraken's expans...,seemed months krakens expansion draft picks li...,1,0,"Montreal Canadiens,Seattle Kraken,Tampa Bay Li...",SEA,Seattle Kraken,...,1,0,1,0,0,0,0.038,0.0,0.962,0.1531
128,1411084249294262274,jetcityice,2021-07-02 22:07:56,2021 NHL Expansion Draft ramifications for #Se...,nhl expansion draft ramifications seakraken ne...,1,0,"Minnesota Wild,Seattle Kraken",MIN,Minnesota Wild,...,0,2,1,0,1,3,0.057,0.0,0.943,0.3818


In [31]:
# classify based on VADER readme rules
def sentiment_classifier(df, data_column):

    # create a list of our conditions
    conditions = [
        (df[data_column] >= 0.05),
        (df[data_column] > -0.05) & (df[data_column] < 0.05),
        (df[data_column] <= -0.05),
        ]

    # create a list of the values we want to assign for each condition
    values = ['positive', 'neutral', 'negative']
    
    # apply
    df['sentiment'] = np.select(conditions, values)
    return df

test = sentiment_classifier(df_nhl, 'compound_score')

display(test.sample(5))

  and should_run_async(code)


Unnamed: 0,id,user,created_at,full_text,clean_text,multiple_teams,no_matches,teams_concat,nhl_team_abbr,nhl_team,...,hashtag_ct,link_ct,atsign_ct,numeric_ct,uppercase_ct,positive_score,negative_score,neutral_score,compound_score,sentiment
262,1413634069540134912,walshy66,2021-07-09 23:00:00,"NEW PODCAST: Tampa win, expansion draft, and t...",new podcast tampa win expansion draft tidbits ...,1,0,"Pittsburgh Penguins,Tampa Bay Lightning",PIT,Pittsburgh Penguins,...,2,2,0,0,3,0.226,0.0,0.774,0.5859,positive
149,1411416504042655744,NathanGraviteh,2021-07-03 20:08:12,*NEW* *VIDEO*\n\nTODAY I Predict the ENITRE Se...,new video today predict enitre seattle kraken ...,1,0,"Montreal Canadiens,Seattle Kraken,St Louis Blues",MTL,Montreal Canadiens,...,3,2,0,1,9,0.0,0.0,1.0,0.0,neutral
494,1412441045133103114,StLouisBlues,2021-07-06 15:59:21,The @NHL #ExpansionDraft is almost here! \n\nH...,expansiondraft almost many players seattle sel...,1,0,"Seattle Kraken,St Louis Blues",STL,St Louis Blues,...,2,1,1,0,1,0.097,0.0,0.903,0.5502,positive
409,1412872575970017282,nhl_tradetalk,2021-07-07 20:34:06,GM David Poile says the Nashville Predators ar...,gm david poile says nashville predators lookin...,1,0,"Nashville Predators,Seattle Kraken",SEA,Seattle Kraken,...,0,1,0,0,2,0.0,0.0,1.0,0.0,neutral
69,1411363125538983939,AntGenocide,2021-07-03 16:36:05,Am the only one ready for NHL playoffs to be o...,one ready nhl playoffs get expansion draft off...,0,0,Dallas Stars,DAL,Dallas Stars,...,0,0,0,0,1,0.079,0.0,0.921,0.3612,positive


In [32]:
def print_top_n_reviews(df, data_column, num_rows):
    text = 'full_text'
    for tweet, row in df.nlargest(num_rows, data_column).iterrows():
        print(f"Score: {row[data_column]}, Tweet: {row[text]}")
    return df

res = print_top_n_reviews(text_sentiment , 'positive_score', 5)

res

Score: 0.363, Tweet: Sunday Morning Skate: Happy Fourth of July! https://t.co/qG9KdYsH1m
Score: 0.363, Tweet: Sunday Morning Skate: Happy Fourth of July! https://t.co/GxkMmXdrJw
Score: 0.36, Tweet: @FO_VVerhei I love that the NHL makes the expansion draft so that the team has a chance to be good right away. Excited for Seattle!
Score: 0.355, Tweet: @TSN1200 NHL expansion, draft and free agency.
Score: 0.354, Tweet: Capitals Will Lose a Good Defenseman in Expansion Draft but Gain Salary Cap Space https://t.co/dtgxq7GuIB https://t.co/TZ1PYh1LBS
  and should_run_async(code)


Unnamed: 0,id,user,created_at,full_text,clean_text,multiple_teams,no_matches,teams_concat,nhl_team_abbr,nhl_team,...,hashtag_ct,link_ct,atsign_ct,numeric_ct,uppercase_ct,positive_score,negative_score,neutral_score,compound_score,sentiment
0,1413869071695814658,CBJcoverage,2021-07-10 14:33:49,Friedman: Jack Eichel trade now expected after...,friedman jack eichel trade expected expansion ...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",ANA,Anaheim Ducks,...,1,1,0,0,1,0.000,0.071,0.929,-0.2960,negative
1,1413732444151422978,NHL_Watcher,2021-07-10 05:30:55,Friedman in 33 Thoughts writes a Jack Eichel t...,friedman thoughts writes jack eichel trade lik...,1,0,"Anaheim Ducks,Boston Bruins,Calgary Flames,Los...",ANA,Anaheim Ducks,...,0,0,0,1,1,0.000,0.000,1.000,0.0000,neutral
2,1411070238150434818,mayorNHL,2021-07-02 21:12:15,(new post) NHL RADIO REPLAY: Mayor’s Minutes\n...,new post nhl radio replay mayor minutes topics...,1,0,"Anaheim Ducks,Los Angeles Kings,Seattle Kraken",ANA,Anaheim Ducks,...,0,2,0,0,7,0.061,0.000,0.939,0.2960,positive
3,1413974338173292550,hockeyaddicts,2021-07-10 21:32:07,"NHL Rumors: Oilers, Blackhawks, Flyers, Sabres...",nhl rumors oilers blackhawks flyers sabres hur...,1,0,"Arizona Coyotes,Buffalo Sabres,Carolina Hurric...",ARZ,Arizona Coyotes,...,0,2,0,0,2,0.000,0.000,1.000,0.0000,neutral
4,1413270173352562690,ClubsCrimson,2021-07-08 22:54:01,NHL Draft Order is set. Buffalo Sabres pick 1s...,nhl draft order set buffalo sabres pick expans...,1,0,"Arizona Coyotes,Buffalo Sabres,Seattle Kraken",ARZ,Arizona Coyotes,...,0,1,1,0,2,0.000,0.080,0.920,-0.5267,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,1411138243328946183,xHalfy,2021-07-03 01:42:29,Honestly I just want this nhl season to be ove...,honestly want nhl season get free agency crazy...,0,1,,Unknown,Unknown,...,0,0,0,0,1,0.245,0.077,0.677,0.6369,positive
771,1411129913785126913,CEO_Kehoe,2021-07-03 01:09:23,@NHL_Watcher Nah. They're just sad to see thei...,nah theyre sad see dude go cant continue shit ...,0,1,,Unknown,Unknown,...,0,0,1,0,0,0.071,0.109,0.821,-0.1471,negative
772,1411120522579095553,jacobrotenberg,2021-07-03 00:32:04,Monday will be the last game of the NHL season...,monday last game nhl season onto expansion dra...,0,1,,Unknown,Unknown,...,1,0,0,0,1,0.148,0.000,0.852,0.5106,positive
773,1411111522496135170,AlexLanglois15,2021-07-02 23:56:18,"@MTBO86247553 No worries here at all, the play...",worries playoffs arent even yet lol wait till ...,0,1,,Unknown,Unknown,...,0,0,1,0,1,0.204,0.000,0.796,0.6288,positive


In [33]:
top_tweet_list = []

def print_top_n_reviews(df, data_column, num_rows):
    text = 'full_text'
    top = df.nlargest(num_rows, data_column)
    top_tweets = df[[data_column,text]]
    return top_tweets

res = print_top_n_reviews(text_sentiment , 'positive_score', 5)

res

  and should_run_async(code)


Unnamed: 0,positive_score,full_text
0,0.000,Friedman: Jack Eichel trade now expected after...
1,0.000,Friedman in 33 Thoughts writes a Jack Eichel t...
2,0.061,(new post) NHL RADIO REPLAY: Mayor’s Minutes\n...
3,0.000,"NHL Rumors: Oilers, Blackhawks, Flyers, Sabres..."
4,0.000,NHL Draft Order is set. Buffalo Sabres pick 1s...
...,...,...
770,0.245,Honestly I just want this nhl season to be ove...
771,0.071,@NHL_Watcher Nah. They're just sad to see thei...
772,0.148,Monday will be the last game of the NHL season...
773,0.204,"@MTBO86247553 No worries here at all, the play..."


In [34]:
num_rows = 5
data_column = 'positive_score'
text = 'full_text'
df = text_sentiment
top = df.nlargest(num_rows, data_column)
top_tweets = top[[data_column,text]].reset_index()
top_tweets = top_tweets.drop(columns = 'index')
top_tweets.index = top_tweets.index + 1 
top_tweets.head(5)


  and should_run_async(code)


Unnamed: 0,positive_score,full_text
1,0.363,Sunday Morning Skate: Happy Fourth of July! ht...
2,0.363,Sunday Morning Skate: Happy Fourth of July! ht...
3,0.36,@FO_VVerhei I love that the NHL makes the expa...
4,0.355,"@TSN1200 NHL expansion, draft and free agency."
5,0.354,Capitals Will Lose a Good Defenseman in Expans...


In [35]:
sent_type = 'Positive'
   
if sent_type == 'Positive':
    sent_nm= 'positive_score'

# Scenario 2: Bigrams
if sent_type== 'Neutral':
    sent_nm == 'neutral_score'

# Scenario 3: Trigrams
if sent_type == 'Negative':
    sent_nm == 'negative_score'
    
text = 'full_text'
top = df.nlargest(num_rows, sent_nm)
top_tweets = top[[sent_nm,text]]

  and should_run_async(code)


## Part 6: Data & Text Cleaning
* Change to lower case
* Remove punctuation, stopwords, URLs, html tags, emojis, emoticons
* Spell correction
* Explore & remove custom stopwords

In [None]:
# Create a copy to preserve the raw data
df_tweets = df_tweets_raw.copy()

# Sample the tweets
df_tweets_raw.full_text[26]

In [None]:
# View more sample tweets
for i in range(0,5):
    print(str(i+1) + ') ' + df_tweets_raw.full_text[i] + '\n')

## Step 6a: Convert tweets to lower case

In [None]:
# Convert transcripts to lower case
df_tweets['text_cleaned'] = df_tweets['full_text'].map(lambda x: x.lower())

# Print out sample of cleaned transcripts
print("Cleaned tweet:\n" + df_tweets['text_cleaned'][26])

## Step 6b: Remove URLs, html tags, punctuation, stopwords, emojis, emoticons

In [None]:
# Remove tweet URLs
df_tweets['text_cleaned'] = df_tweets.text_cleaned.map(lambda x: re.sub(r'https\:\/\/t\.co\/*\w*', '', x, flags=re.MULTILINE).strip())

# Print out sample of cleaned transcripts
print("Cleaned tweet:\n" + df_tweets['text_cleaned'][26])

In [None]:
# Remove punctuation
df_tweets['text_cleaned'] = df_tweets.text_cleaned.str.replace('[^\w\s]','')

# Print out sample of cleaned transcripts
print("Cleaned tweet:\n" + df_tweets['text_cleaned'][26])

In [None]:
# Remove stopwords (start with library to identify stopwords)
from nltk.corpus import stopwords

# Define english & french stopwords 
stop_en = stopwords.words('english')
stop_fr = stopwords.words('french')

# Remove english & french stopwords
df_tweets['text_cleaned'] = df_tweets.text_cleaned.apply(lambda x: " ".join(x for x in x.split() if x not in stop_en))
df_tweets['text_cleaned'] = df_tweets.text_cleaned.apply(lambda x: " ".join(x for x in x.split() if x not in stop_fr))

# Print out sample of cleaned transcripts
print("Cleaned tweet:\n" + df_tweets['text_cleaned'][26])

In [None]:
# Define function to remove emojis --> e.g. 😜
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Remove all emojis
df_tweets['text_cleaned'] = df_tweets.text_cleaned.apply(lambda x: remove_emoji(x))

# Print out sample of cleaned transcripts
print("Cleaned tweet:\n" + df_tweets['text_cleaned'][26])

In [None]:
# Define function to remove emoticons --> e.g. :-)

# Libraries
!pip install emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

# Function for removing emoticons
def remove_emoticons(text):
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)

# Remove all emoticons
df_tweets['text_cleaned'] = df_tweets.text_cleaned.apply(lambda x: remove_emoticons(x))

# Print out sample of cleaned transcripts
print("Cleaned tweet:\n" + df_tweets['text_cleaned'][26])

## Step 6c: Tokenize and lemmatize tweets
Tokenization parses tweets into individual words and lemmatization removes inflectional endings (ie. endings that add grammatical meaning). These methods prepare the data for word frequency and n-gram analysis.

Methods used below reference this post: https://towardsdatascience.com/from-dataframe-to-n-grams-e34e29df3460

In [None]:
def clean_text_round1(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) # remove emoji
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = ' ' + text # added space because there was some weirdness for first word (strip later)
    text = text.lower() # convert all text to lowercase
    text = re.sub(r'(\s)@\w+', '', text) # remove whole word if starts with @
    text = re.sub(r'(\s)\w*\d\w*\w+', '', text) # remove whole word if starts with number
    text = re.sub(r'https\:\/\/t\.co\/*\w*', '', text) # remove https links
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # removes punctuation
    text = re.sub('\[.*?\]', '', text) # removes text in square brackets
    #text = re.sub('\w*\d\w*', '', text) # remove whole word if starts with number
    #text = re.sub(r'(\s)#\w+', '', text) # remove whole word if starts with #
    text = text.strip()
    return text

round1 = lambda x: clean_text_round1(x)

df_tweets_raw['clean_text'] = df_tweets_raw.full_text.apply(round1)
print("Cleaned tweet:\n" + df_tweets_raw['clean_text'][4])


In [None]:
def round3_text_clean(text):
    #TODO: add emoticons and emojis to this!
    # Load in stopwords
    stopwords_en = nltk.corpus.stopwords.words('english')
    stopwords_fr = nltk.corpus.stopwords.words('french')
    stopwords = stopwords_en + stopwords_fr
    # Create pre-clean character count feature
    text = text.apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))
    return text

df3 = round3_text_clean(df_tweets_raw.full_text)

df3.head(4)

## Step 6d: Word correction (needs work & review!)

In [None]:
#TODO Check to see if this worked as intended...

# Spell correction 
from textblob import TextBlob
df_tweets['text_cleaned'] = df_tweets.text_cleaned.[:5].apply(lambda x: str(TextBlob(x).correct()))

# Step 7: Text analytics (analyze tweets)

## Step 7a: Word frequencies and n-grams

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
import unicodedata


def text_clean_round2(text):
    """
    A simple function to clean up the data. All the words that
    are not designated as a stop word is then lemmatized after
    encoding and basic regex parsing are performed.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')
    text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore'))
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]


# Word cloud
txt = text_clean_round2(''.join(str(df_tweets.text_cleaned.tolist())))


# Python program to convert a list
# to string using join() function
    
# Function to convert  
def listToString(s): 
    
    # initialize an empty string
    str1 = " " 
    
    # return string  
    return (str1.join(s))
        
        
# Driver code    
print(listToString(txt)) 




In [None]:
def tweets_ngrams(n, top_n, df):
    text = df.text_cleaned
    words = text_clean_round2(''.join(str(text.tolist())))
    result = (pd.Series(data = nltk.ngrams(words, n), name = 'freq').value_counts())[:top_n]
    return result.to_frame()

word_series = tweets_ngrams(1, 2, df_tweets)
bigram_series = tweets_ngrams(2, 2, df_tweets)
trigram_series = tweets_ngrams(3, 2, df_tweets)

word_series['ngram'] = 'unigram'
bigram_series['ngram'] = 'bigram'
trigram_series['ngram'] = 'trigram'

rez = word_series.append([bigram_series, trigram_series])
rez['ngram_nm'] = rez.index


rez.head(20)

In [None]:
def all_ngrams(top_n, df):
    text = df.text_cleaned
    words = text_clean_round2(''.join(str(text.tolist())))
    unigram = ((pd.Series(data = nltk.ngrams(words, 1), name = 'freq').value_counts())[:top_n]).to_frame()
    unigram['ngram'] = 'unigram'
    bigram = ((pd.Series(data = nltk.ngrams(words, 2), name = 'freq').value_counts())[:top_n]).to_frame()
    bigram['ngram'] = 'bigram'
    trigram = ((pd.Series(data = nltk.ngrams(words, 3), name = 'freq').value_counts())[:top_n]).to_frame()
    trigram['ngram'] = 'trigram'
    result = unigram.append([bigram, trigram])
    result['ngram_nm'] = result.index
    return result

rezz = all_ngrams(3, df_tweets)


rez.head(20)

In [None]:
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Helper function
def plot_30_most_common_words(count_data, count_vectorizer):
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:20]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='30 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl', orient = 'h')
    plt.yticks(x_pos, words) 
    plt.ylabel('words')
    plt.xlabel('counts')
    plt.show()

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed transcripts
count_data = count_vectorizer.fit_transform(df_tweets['text_cleaned'])

# Visualise the 30 most common words
plot_30_most_common_words(count_data, count_vectorizer)

## Step 7b: Sentiment analysis

In [86]:
# Function 8
#----------------

# Credit: https://jackmckew.dev/sentiment-analysis-text-cleaning-in-python-with-vader.html
sid_analyzer = SentimentIntensityAnalyzer()

# Get sentiment
def get_sentiment(text:str, analyser, desired_type:str='pos'):
    # Get sentiment from text
    sentiment_score = analyser.polarity_scores(text)
    return sentiment_score[desired_type]

# Get Sentiment scores
def get_sentiment_scores(df, data_column):
    df[f'positive_score'] = df[data_column].astype(str).apply(lambda x: get_sentiment(x,sid_analyzer,'pos'))
    df[f'negative_score'] = df[data_column].astype(str).apply(lambda x: get_sentiment(x,sid_analyzer,'neg'))
    df[f'neutral_score'] = df[data_column].astype(str).apply(lambda x: get_sentiment(x,sid_analyzer,'neu'))
    df[f'compound_score'] = df[data_column].astype(str).apply(lambda x: get_sentiment(x,sid_analyzer,'compound'))
    return df

# Function 9
#----------------
# Credit: https://www.dataquest.io/blog/tutorial-add-column-pandas-dataframe-based-on-if-else-condition/

# classify based on VADER readme rules
def sentiment_classifier(df, data_column):

    # create a list of our conditions
    conditions = [
        (df[data_column] >= 0.05),
        (df[data_column] > -0.05) & (df[data_column] < 0.05),
        (df[data_column] <= -0.05),
        ]

    # create a list of the values we want to assign for each condition
    values = ['Positive', 'Neutral', 'Negative']
    
    # apply
    df['sentiment'] = np.select(condlist = conditions, choicelist = values)
    return df


  and should_run_async(code)


In [97]:
# Get sentiment scores on raw tweets
text_sentiment = get_sentiment_scores(df_nhl, 'full_text')

# Add sentiment classification
text_sentiment = sentiment_classifier(df_nhl, 'compound_score')

# Select columns to output
df_sentiment = text_sentiment[['id', 'created_at', 'nhl_team_abbr', 'nhl_team', 'expansion_type', 'full_text', 'sentiment', 'positive_score', 'negative_score', 'neutral_score', 'compound_score']]

# Sentiment group dataframe
sentiment_group = df_sentiment.groupby(['sentiment']).agg({'id': 'nunique'}).reset_index()
expansion_group = df_sentiment.groupby(['expansion_type', 'sentiment']).agg({'id': 'nunique', 'compound_score': ['mean', 'median', 'min', 'max']}).reset_index(level=[0,1])
team_group = df_sentiment.groupby(['nhl_team_abbr', 'nhl_team']).agg({'id': 'nunique', 'compound_score': ['mean', 'median', 'min', 'max']}).reset_index(level=[0,1])
team_group.columns = ['nhl_team_abbr', 'nhl_team', 'tweets', 'avg_compound_score', 'median_compound_score', 'min_compound_score', 'max_compound_score']
expansion_group.columns = ['expansion_type', 'sentiment', 'tweets', 'avg_compound_score', 'median_compound_score', 'min_compound_score', 'max_compound_score']

expansion_group2 = df_sentiment.groupby(['expansion_type']).agg({'id': 'nunique', 'compound_score': ['mean', 'median', 'min', 'max']}).reset_index(level=[0,0])
expansion_group2.columns = ['expansion_type',  'tweets', 'avg_compound_score', 'median_compound_score', 'min_compound_score', 'max_compound_score']

#unknown_group = unknown_group.loc[unknown_group['nhl_team_abbr'] == 'Unknown']


# rename
sentiment_group.rename(columns={"id": "tweets"}, inplace = True)
expansion_group.rename(columns={"id": "tweets"}, inplace = True)
team_group.rename(columns={"id": "tweets"}, inplace = True)


#team_group.index.to_flat_index()
expansion_group2.head(10)
#team_group.columns

  and should_run_async(code)


Unnamed: 0,expansion_type,tweets,avg_compound_score,median_compound_score,min_compound_score,max_compound_score
0,Kraken,214,0.210047,0.0,-0.7339,0.9704
1,Rest of League,306,0.177417,0.0,-0.8074,0.926
2,Unknown,186,0.209221,0.08995,-0.8555,0.9481
