# Sentiment Analysis in Python

# 1. Sentiment Analysis Nuts and Bolts

## Welcome!

In [None]:
# Absolute frequency of each sentiment label in the movies data
label_counts = movies.label.value_counts()
print('Number of positive and negative reviews: ', label_counts)

# Relative frequency: the same counts divided by the total number of rows
label_proportions = label_counts / len(movies)
print('Proportion of positive and negative reviews: ', label_proportions)

"""
Number of positive and negative reviews:  0    530
    1    470
    Name: label, dtype: int64
    Proportion of positive and negative reviews:  0    0.53
    1    0.47
    Name: label, dtype: float64
"""

### Longest and shortest reviews

In [None]:
# Character count of every review, computed with the vectorised string accessor
length_reviews = movies.review.str.len()

# Length of the shortest review
shortest_review_length = min(length_reviews)
print(shortest_review_length)

# 53

## Sentiment analysis types and approaches

### Detecting the sentiment of Tale of Two Cities

In [None]:
# TextBlob supplies the sentiment scorer used in this cell
from textblob import TextBlob

# Wrap the two_cities text in a TextBlob so it can be analysed
blob_two_cities = TextBlob(two_cities)

# Show the sentiment (polarity and subjectivity) computed for the text
two_cities_sentiment = blob_two_cities.sentiment
print(two_cities_sentiment)

# Sentiment(polarity=0.022916666666666658, subjectivity=0.5895833333333332)

### Comparing the sentiment of two strings

In [None]:
# TextBlob supplies the sentiment scorer used in this cell
from textblob import TextBlob

# One TextBlob per string so the two sentiments can be compared side by side
blob_annak = TextBlob(annak)
blob_catcher = TextBlob(catcher)

# Report the sentiment of each text in turn
sentiment_annak = blob_annak.sentiment
print('Sentiment of annak: ', sentiment_annak)
sentiment_catcher = blob_catcher.sentiment
print('Sentiment of catcher: ', sentiment_catcher)

# Sentiment of annak:  Sentiment(polarity=0.05000000000000002, subjectivity=0.95)
# Sentiment of catcher:  Sentiment(polarity=-0.05, subjectivity=0.5466666666666666)

### What is the sentiment of a movie review?

In [None]:
# TextBlob supplies the sentiment scorer used in this cell
from textblob import TextBlob

# Wrap the titanic review text in a TextBlob
blob_titanic = TextBlob(titanic)

# Show the sentiment computed for the review
titanic_sentiment = blob_titanic.sentiment
print(titanic_sentiment)

# Sentiment(polarity=0.2024748060772906, subjectivity=0.4518248900857597)

## Let's build a word cloud!

### Your first word cloud

In [None]:
from wordcloud import WordCloud

# Configure a white-background cloud, then build it from the east_of_eden string
# (generate() is chained on the configured object, exactly as in a one-line call)
cloud_builder = WordCloud(background_color="white")
cloud_east_of_eden = cloud_builder.generate(east_of_eden)

# Render the cloud as an image without axis ticks or frame
# NOTE(review): `plt` is not imported anywhere in the visible notebook -
# presumably matplotlib.pyplot loaded by the environment; confirm.
plt.imshow(cloud_east_of_eden, interpolation='bilinear')
plt.axis('off')

# Display the figure
plt.show()

### Word Cloud on movie reviews

In [None]:
# WordCloud builds the image from raw text
from wordcloud import WordCloud

# Configure the cloud (white background, custom stop words), then feed it the descriptions text
cloud_builder = WordCloud(background_color='white', stopwords=my_stopwords)
my_cloud = cloud_builder.generate(descriptions)

# Draw the cloud as an image and hide the axes
plt.imshow(my_cloud, interpolation='bilinear')
plt.axis("off")

# Show the final image
plt.show()

# 2. Numeric Features from Reviews

## Bag-of-words

### Your first BOW

In [None]:
# CountVectorizer turns text into token-count vectors (bag-of-words)
from sklearn.feature_extraction.text import CountVectorizer

# Two short documents to vectorize (this rebinds `annak` to a list of strings)
annak = ['Happy families are all alike;', 'every unhappy family is unhappy in its own way']

# Learn the vocabulary from the two documents
anna_vect = CountVectorizer().fit(annak)

# Encode the same documents as count vectors over that vocabulary
anna_bow = anna_vect.transform(annak)

# The transform result is sparse; densify it for printing
print(anna_bow.toarray())

"""
[[1 1 1 0 1 0 1 0 0 0 0 0 0]
[0 0 0 1 0 1 0 1 1 1 1 2 1]]
"""

### BOW using product reviews

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 100 features and learn it from the review text
vect = CountVectorizer(max_features=100).fit(reviews.review)

# Encode every review as token counts
X_review = vect.transform(reviews.review)

# Densify the counts and label each column with its vocabulary term
# NOTE(review): `pd` is not imported anywhere in the visible notebook - presumably pandas; confirm.
# NOTE(review): get_feature_names() may be unavailable on newer scikit-learn
# releases (get_feature_names_out() replaces it) - confirm the pinned version.
feature_names = vect.get_feature_names()
X_df = pd.DataFrame(X_review.toarray(), columns=feature_names)
print(X_df.head())


"""
       about  after  all  also  am  ...  with  work  would  you  your
    0      0      0    1     0   0  ...     1     0      2    0     1
    1      0      0    0     0   0  ...     0     0      1    1     0
    2      0      0    3     0   0  ...     0     1      1    2     0
    3      0      0    0     0   0  ...     0     0      0    0     0
    4      0      1    0     0   0  ...     0     0      0    3     1
    
    [5 rows x 100 columns]
"""

## Getting granular with n-grams

### Specify token sequence length with BOW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(1, 2) is passed so the vocabulary covers token sequences of length 1 and 2
vect = CountVectorizer(ngram_range=(1, 2)).fit(reviews.review)

# Encode every review over that vocabulary
X_review = vect.transform(reviews.review)

# Densify the counts and label each column with its n-gram
ngram_names = vect.get_feature_names()
X_df = pd.DataFrame(X_review.toarray(), columns=ngram_names)
print(X_df.head())


""" 
       10  10 95  10 cups  100  100 years  ...  zelbessdisk  zelbessdisk three  zen  zen baseball  zen motorcycle
    0   0      0        0    0          0  ...            0                  0    0             0               0
    1   0      0        0    0          0  ...            0                  0    0             0               0
    2   0      0        0    0          0  ...            0                  0    0             0               0
    3   0      0        0    0          0  ...            1                  1    0             0               0
    4   0      0        0    0          0  ...            0                  0    0             0               0
    
    [5 rows x 8436 columns]
"""

### Size of vocabulary of movie reviews

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Limit the vocabulary size to 100 features and learn it from the movie reviews
vect = CountVectorizer(max_features=100).fit(movies.review)

# Encode the movie reviews as token counts
X_review = vect.transform(movies.review)

# Densify the counts and label each column with its vocabulary term
vocabulary_terms = vect.get_feature_names()
X_df = pd.DataFrame(X_review.toarray(), columns=vocabulary_terms)
print(X_df.head())


"""
       about  all  also  an  and  ...  who  will  with  would  you
    0      0    0     0   0    1  ...    0     0     1      1    0
    1      0    3     1   1   11  ...    0     2     7      2    3
    2      0    0     0   1    7  ...    0     0     2      0    0
    3      0    0     0   2    1  ...    1     0     0      0    1
    4      0    3     0   0    8  ...    0     0     2      0    0
    
    [5 rows x 100 columns]
"""

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Restrict the vocabulary with max_df=200 (upper bound on a term's document frequency)
vect = CountVectorizer(max_df=200).fit(movies.review)

# Encode the movie reviews over the restricted vocabulary
X_review = vect.transform(movies.review)

# Densify the counts and label each column with its vocabulary term
vocabulary_terms = vect.get_feature_names()
X_df = pd.DataFrame(X_review.toarray(), columns=vocabulary_terms)
print(X_df.head())

"""
       00  000  000s  007  00s  ...  zooms  zsigmond  zulu  zuniga  zvyagvatsev
    0   0    0     0    0    0  ...      0         0     0       0            0
    1   0    0     0    0    0  ...      0         0     0       0            0
    2   0    0     0    0    0  ...      0         0     0       0            0
    3   0    0     0    0    0  ...      0         0     0       0            0
    4   0    0     0    0    0  ...      0         0     0       0            0
    
    [5 rows x 17669 columns]
"""

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Restrict the vocabulary with min_df=50 (lower bound on a term's document frequency)
vect = CountVectorizer(min_df=50).fit(movies.review)

# Encode the movie reviews over the restricted vocabulary
X_review = vect.transform(movies.review)

# Densify the counts and label each column with its vocabulary term
vocabulary_terms = vect.get_feature_names()
X_df = pd.DataFrame(X_review.toarray(), columns=vocabulary_terms)
print(X_df.head())

"""
       10  about  absolutely  acting  action  ...  yes  yet  you  young  your
    0   0      0           0       0       0  ...    0    0    0      0     0
    1   1      0           0       1       0  ...    0    1    3      0     2
    2   0      0           0       0       0  ...    0    0    0      1     0
    3   0      0           0       0       1  ...    0    0    1      1     0
    4   1      0           0       0       1  ...    0    0    0      0     0
    
    [5 rows x 434 columns]
"""

### BOW with n-grams and vocabulary size

In [None]:
# CountVectorizer builds the bag-of-words representation
from sklearn.feature_extraction.text import CountVectorizer

# Bigrams only (ngram_range=(2, 2)), at most 1000 features, with max_df=500
vect = CountVectorizer(max_features=1000, ngram_range=(2, 2), max_df=500).fit(reviews.review)

# Encode the reviews over the bigram vocabulary
X_review = vect.transform(reviews.review)

# Densify the counts and label each column with its bigram
bigram_names = vect.get_feature_names()
X_df = pd.DataFrame(X_review.toarray(), columns=bigram_names)
print(X_df.head())

"""
       1980 style  aa batteries  aaa batteries  able to  about the  ...  you want  you will  your imagination  your money  yr old
    0           0             0              0        0          0  ...         0         0                 0           0       0
    1           0             0              0        0          0  ...         0         0                 0           0       0
    2           0             0              0        0          0  ...         0         0                 0           0       0
    3           0             0              0        0          0  ...         0         0                 0           0       0
    4           0             0              0        0          0  ...         0         0                 0           0       0
    
    [5 rows x 1000 columns]
"""

## Build new features from text

### Tokenize a string from GoT

In [None]:
# word_tokenize splits a string into word and punctuation tokens
from nltk import word_tokenize

# Tokenize the GoT string and show the resulting token list
got_tokens = word_tokenize(GoT)
print(got_tokens)

"""
['Never', 'forget', 'what', 'you', 'are', ',', 'for', 'surely', 'the', 'world', 'will', 'not', '.', 'Make', 'it', 'your', 
'strength', '.', 'Then', 'it', 'can', 'never', 'be', 'your', 'weakness', '.', 'Armour', 'yourself', 'in', 'it', ',', 'and',
'it', 'will', 'never', 'be', 'used', 'to', 'hurt', 'you', '.']
"""

### Word tokens from the Avengers

In [None]:
# word_tokenize splits a string into word and punctuation tokens
from nltk import word_tokenize

# Apply the tokenizer to every item of avengers, giving one token list per item
tokens_avengers = list(map(word_tokenize, avengers))

print(tokens_avengers)

"""
[['Cause', 'if', 'we', 'ca', "n't", 'protect', 'the', 'Earth', ',', 'you', 'can', 'be', 'd', '*', '*', '*',
'sure', 'we', "'ll", 'avenge', 'it'], ['There', 'was', 'an', 'idea', 'to', 'bring', 'together', 'a', 'group', 
'of', 'remarkable', 'people', ',', 'to', 'see', 'if', 'we', 'could', 'become', 'something', 'more'], ['These', 
'guys', 'come', 'from', 'legend', ',', 'Captain', '.', 'They', "'re", 'basically', 'Gods', '.']]
"""

### A feature for the length of a review

In [None]:
# word_tokenize splits a string into word and punctuation tokens
from nltk import word_tokenize

# One token list per entry of the review column, in column order
word_tokens = list(map(word_tokenize, reviews.review))

# Inspect the tokens of the first review
first_review_tokens = word_tokens[0]
print(first_review_tokens)

"""
['Stuning', 'even', 'for', 'the', 'non-gamer', ':', 'This', 'sound', 'track', 'was', 'beautiful', '!', 'It', 'paints', 
'the', 'senery', 'in', 'your', 'mind', 'so', 'well', 'I', 'would', 'recomend', 'it', 'even', 'to', 'people', 'who', 
'hate', 'vid', '.', 'game', 'music', '!', 'I', 'have', 'played', 'the', 'game', 'Chrono', 'Cross', 'but', 'out', 'of', 
'all', 'of', 'the', 'games', 'I', 'have', 'ever', 'played', 'it', 'has', 'the', 'best', 'music', '!', 'It', 'backs', 
'away', 'from', 'crude', 'keyboarding', 'and', 'takes', 'a', 'fresher', 'step', 'with', 'grate', 'guitars', 'and', 
'soulful', 'orchestras', '.', 'It', 'would', 'impress', 'anyone', 'who', 'cares', 'to', 'listen', '!', '^_^']
"""

In [None]:
# Number of tokens in each tokenized review, in the same order as word_tokens.
# A comprehension replaces the index-based loop: no manual indexing or appends.
len_tokens = [len(review_tokens) for review_tokens in word_tokens]

# Create a new feature for the length (in tokens) of each review
reviews['n_words'] = len_tokens

## Can you guess the language?

### Identify the language of a string

In [None]:
# detect_langs returns the candidate languages of a text with their probabilities
from langdetect import detect_langs

# Detect the language of the foreign string and show the candidates
foreign_languages = detect_langs(foreign)
print(foreign_languages)

# [fr:0.9999972065813575]

### Detect language of a list of strings

In [None]:
from langdetect import detect_langs

# One detection result per sentence, kept in the order of the input list
languages = [detect_langs(sentence) for sentence in sentences]

print('The detected languages are: ', languages)

"""
["L'histoire rendu était fidèle, excellent, et grande.", 'Excelente muy recomendable.', 
'It had a leak from day one but the return and exchange process was very quick.']


The detected languages are:  [[fr:0.9999970274265436], [es:0.9999954111285407], [en:0.999997772943068]]
"""

### Language detection of product reviews

In [None]:
from langdetect import detect_langs

# Run language detection on the column at position 1 of every row
# (the recorded output of this cell shows that column is `review`)
n_rows = len(non_english_reviews)
languages = [detect_langs(non_english_reviews.iloc[row, 1]) for row in range(n_rows)]

# Each result prints like "[es:0.99...]": keep the text before the first ':'
# and drop the leading '[' so only the language code remains
languages = [str(lang).split(':')[0][1:] for lang in languages]

# Store the language codes as a new column of the DataFrame
non_english_reviews['language'] = languages

print(non_english_reviews.head())

"""
          score                                             review language
    1249      1   Il grande ritorno!: E' dai tempi del tour di ...       it
    1259      1   La reencarnación vista por un científico: El ...       es
    1260      1   Excelente Libro / Amazing book!!: Este libro ...       es
    1261      1   Magnifico libro: Brian Weiss ha dejado una ma...       es
    1639      1   El libro mas completo que existe para nosotra...       es
"""

# 3. More on Numeric Vectors: Transforming Tweets

## Stop words

### Word cloud of tweets

In [None]:
# WordCloud builds the image from raw text
from wordcloud import WordCloud

# Configure a white-background cloud, then build it from the tweet text
cloud_builder = WordCloud(background_color='white')
my_cloud = cloud_builder.generate(text_tweet)

# Draw the cloud as an image and hide the axes
plt.imshow(my_cloud, interpolation='bilinear')
plt.axis("off")

# Show the final image
plt.show()

In [None]:
# Import the word cloud function and the default stop words set
from wordcloud import WordCloud, STOPWORDS

# Build a NEW stop-word set: the defaults plus two domain-specific words.
# union() returns the combined set; update() would mutate the shared
# module-level STOPWORDS in place and return None, leaving my_stop_words
# as None.
my_stop_words = STOPWORDS.union(['airline', 'airplane'])

# Create and generate a word cloud image that ignores the stop words
my_cloud = WordCloud(stopwords=my_stop_words).generate(text_tweet)

# Display the generated word cloud image without axes
plt.imshow(my_cloud, interpolation='bilinear')
plt.axis("off")
# Show the final image
plt.show()

### Airline sentiment with stop words

In [None]:
# Vectorizer plus scikit-learn's built-in English stop-word list
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Extend the default stop words with airline-specific tokens
extra_stop_words = ['airline', 'airlines', '@']
my_stop_words = ENGLISH_STOP_WORDS.union(extra_stop_words)

# Learn the vocabulary from the tweet text, ignoring the stop words
vect = CountVectorizer(stop_words=my_stop_words).fit(tweets.text)

# Encode the tweets as token counts
X_review = vect.transform(tweets.text)

# Densify the counts and label each column with its vocabulary term
vocabulary_terms = vect.get_feature_names()
X_df = pd.DataFrame(X_review.toarray(), columns=vocabulary_terms)
print(X_df.head())


"""
       00  000  000114  000419  0011  ...  zero  zfqmpgxvs6  zone  zsuztnaijq  zv2pt6trk9
    0   0    0       0       0     0  ...     0           0     0           0           0
    1   0    0       0       0     0  ...     0           0     0           0           0
    2   0    0       0       0     0  ...     0           0     0           0           0
    3   0    0       0       0     0  ...     0           0     0           0           0
    4   0    0       0       0     0  ...     0           0     0           0           0
    
    [5 rows x 2867 columns]
"""

### Multiple text columns

In [None]:
# Vectorizer plus scikit-learn's built-in English stop-word list
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Extended stop words, used for the free-text tweet column only
extra_stop_words = ['airline', 'airlines', '@', 'am', 'pm']
my_stop_words = ENGLISH_STOP_WORDS.union(extra_stop_words)

# One vectorizer per text column: tweet text uses the extended stop words,
# negative_reason uses the default English list
vect1 = CountVectorizer(stop_words=my_stop_words).fit(tweets.text)
vect2 = CountVectorizer(stop_words=ENGLISH_STOP_WORDS).fit(tweets.negative_reason)

# Last 15 vocabulary entries of the first vectorizer, full vocabulary of the second
text_features = vect1.get_feature_names()
print(text_features[-15:])
reason_features = vect2.get_feature_names()
print(reason_features)


"""
    ['yesterday', 'yo', 'york', 'youcouldntmakethis', 'yr', 'ywg', 'yxe', 'yyj', 'yyz', 'zambia', 'zcbjyo6lsn', 'zcc82u', 
    'zero', 'zfqmpgxvs6', 'zone']
    
    ['attendant', 'bad', 'booking', 'cancelled', 'complaints', 'customer', 'damaged', 'flight', 'issue', 'late', 'longlines',
    'lost', 'luggage', 'problems', 'service', 'tell']
"""

## Capturing a token pattern

### Specify the token pattern

In [None]:
# Tokens must match this pattern: two or more characters, none of them a digit
# or a non-word character, delimited by word boundaries
vect = CountVectorizer(token_pattern=r'\b[^\d\W][^\d\W]+\b')
vect.fit(tweets.text)

# NOTE(review): the transformed matrix is computed but never stored or used
vect.transform(tweets.text)

# Vocabulary size under this token pattern
n_features = len(vect.get_feature_names())
print('Length of vectorizer: ', n_features)

# Length of vectorizer:  2770

In [None]:
# First vectorizer: default token pattern
vect1 = CountVectorizer()
vect1.fit(tweets.text)
# NOTE(review): the transformed matrix is computed but never stored or used
vect1.transform(tweets.text)

# Second vectorizer: custom pattern matching exactly two characters that are
# neither digits nor non-word characters, starting at a word boundary
vect2 = CountVectorizer(token_pattern=r'\b[^\d\W][^\d\W]')
vect2.fit(tweets.text)
# NOTE(review): result discarded here as well
vect2.transform(tweets.text)

# Compare the vocabulary sizes of the two vectorizers
n_features_1 = len(vect1.get_feature_names())
n_features_2 = len(vect2.get_feature_names())
print('Length of vectorizer 1: ', n_features_1)
print('Length of vectorizer 2: ', n_features_2)

"""
    Length of vectorizer 1:  3081
    Length of vectorizer 2:  332
"""

### String operators with the Twitter data

In [None]:
# word_tokenize splits a string into word and punctuation tokens
from nltk import word_tokenize

# One token list per tweet (this rebinds `word_tokens`)
word_tokens = list(map(word_tokenize, tweets.text))
print('Original tokens: ', word_tokens[0])

# Keep only purely alphabetic tokens; punctuation and tokens with digits are dropped
cleaned_tokens = [
    [token for token in tweet_tokens if token.isalpha()]
    for tweet_tokens in word_tokens
]
print('Cleaned tokens: ', cleaned_tokens[0])

"""
    Original tokens:  ['@', 'VirginAmerica', 'What', '@', 'dhepburn', 'said', '.']
    Cleaned tokens:  ['VirginAmerica', 'What', 'dhepburn', 'said']
"""

### More string operators and Twitter

In [None]:
# One token list per tweet in tweets_list
tokens = list(map(word_tokenize, tweets_list))

# Three filtered views of the same tokens, one list per tweet:
# purely alphabetic tokens only
letters = [list(filter(str.isalpha, item)) for item in tokens]
# alphanumeric tokens (letters and/or digits, no other characters)
let_digits = [list(filter(str.isalnum, item)) for item in tokens]
# purely numeric tokens only
digits = [list(filter(str.isdigit, item)) for item in tokens]

# Print the item at index 2 of each list
# (the last item when tweets_list holds three tweets, as in the recorded output)
print('Last item in alphabetic list: ', letters[2])
print('Last item in list of alphanumerics: ', let_digits[2])
print('Last item in the list of digits: ', digits[2])

"""
["@VirginAmerica it's really aggressive to blast obnoxious 'entertainment' in your guests' faces &amp; they have little 
recourse", "@VirginAmerica Hey, first time flyer next week - excited! But I'm having a hard time getting my flights added to 
my Elevate account. Help?", '@united Change made in just over 3 hours. For something that should have taken seconds online, 
I am not thrilled. Loved the agent, though.']


Last item in alphabetic list:  ['united', 'Change', 'made', 'in', 'just', 'over', 'hours', 'For', 'something', 'that', 'should',
'have', 'taken', 'seconds', 'online', 'I', 'am', 'not', 'thrilled', 'Loved', 'the', 'agent', 'though']

Last item in list of alphanumerics:  ['united', 'Change', 'made', 'in', 'just', 'over', '3', 'hours', 'For', 'something','that',
'should', 'have', 'taken', 'seconds', 'online', 'I', 'am', 'not', 'thrilled', 'Loved', 'the', 'agent', 'though']

Last item in the list of digits:  ['3']

"""

## Stemming and lemmatization

### Stems and lemmas from GoT

In [None]:
# Stemmer and lemmatizer classes, plus the tokenizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Instantiate both once; the timing cells that follow reuse them
porter = PorterStemmer()
WNlemmatizer = WordNetLemmatizer()

# Token list of the GoT string, shared by the stemming and lemmatizing cells
tokens = word_tokenize(GoT)

In [None]:
import time

# Log the start time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start_time = time.perf_counter()

# Build a stemmed list: one Porter stem per token
stemmed_tokens = [porter.stem(token) for token in tokens]

# Log the end time
end_time = time.perf_counter()

print('Time taken for stemming in seconds: ', end_time - start_time)
print('Stemmed tokens: ', stemmed_tokens)

"""
Time taken for stemming in seconds:  0.0008411407470703125

Stemmed tokens:  ['never', 'forget', 'what', 'you', 'are', ',', 'for', 'sure', 'the', 'world', 'will', 'not', '.', 'make', 
'it', 'your', 'strength', '.', 'then', 'it', 'can', 'never', 'be', 'your', 'weak', '.', 'armour', 'yourself', 'in', 'it', ',', 
'and', 'it', 'will', 'never', 'be', 'use', 'to', 'hurt', 'you', '.']
"""

In [None]:
import time

# Log the start time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start_time = time.perf_counter()

# Build a lemmatized list: one WordNet lemma per token
lem_tokens = [WNlemmatizer.lemmatize(token) for token in tokens]

# Log the end time
end_time = time.perf_counter()

print('Time taken for lemmatizing in seconds: ', end_time - start_time)
print('Lemmatized tokens: ', lem_tokens)

"""

Time taken for lemmatizing in seconds:  1.321437120437622

Lemmatized tokens:  ['Never', 'forget', 'what', 'you', 'are', ',', 'for', 'surely', 'the', 'world', 'will', 'not', '.', 
'Make', 'it', 'your', 'strength', '.', 'Then', 'it', 'can', 'never', 'be', 'your', 'weakness', '.', 'Armour', 'yourself',
'in', 'it', ',', 'and', 'it', 'will', 'never', 'be', 'used', 'to', 'hurt', 'you', '.']
"""

### Stem Spanish reviews

In [None]:
# Language detection package
import langdetect

# Run language detection on the column at position 1 of every row
languages = [
    langdetect.detect_langs(non_english_reviews.iloc[i, 1])
    for i in range(len(non_english_reviews))
]

# Each result prints like "[es:0.99...]": keep the text before the first ':'
# and drop the leading '[' so only the language code remains
languages = [str(lang).split(':')[0][1:] for lang in languages]

# Store the language codes as a new column of the DataFrame
non_english_reviews['language'] = languages

# Keep only the rows detected as Spanish
is_spanish = non_english_reviews.language == 'es'
filtered_reviews = non_english_reviews[is_spanish]

In [None]:
# Snowball stemmer (supports Spanish) and the tokenizer
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Instantiate the stemmer for Spanish
SpanishStemmer = SnowballStemmer("spanish")

# One token list per Spanish review
tokens = list(map(word_tokenize, filtered_reviews.review))

# Stem every token, preserving the per-review grouping
stemmed_tokens = [
    [SpanishStemmer.stem(word) for word in review_tokens]
    for review_tokens in tokens
]

# Print the stemmed tokens of the first review
print(stemmed_tokens[0])

"""
['la', 'reencarn', 'vist', 'por', 'un', 'cientif', ':', 'el', 'prim', 'libr', 'del', 'dr.', 'weiss', 'sig', 'siend', 'un', 
'gran', 'libr', 'par', 'tod', 'aquell', 'a', 'quien', 'les', 'inquiet', 'el', 'tem', 'de', 'la', 'reencarn', ',', 'asi', 
'no', 'cre', 'en', 'ella', '.']
"""

### Stems from tweets

In [None]:
# Porter stemmer and the tokenizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer

# Instantiate the stemmer
porter = PorterStemmer()

# One token list per element of `tweets`
# NOTE(review): other cells use `tweets` as a DataFrame (tweets.text); this cell
# iterates it directly, so it presumably expects a sequence of tweet strings - confirm.
tokens = list(map(word_tokenize, tweets))

# Stem every token, preserving the per-tweet grouping
stemmed_tokens = [
    [porter.stem(word) for word in tweet_tokens]
    for tweet_tokens in tokens
]

# Print the stemmed tokens of the first tweet
print(stemmed_tokens[0])

# ['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.']

## TfIdf: More ways to transform text

### Your first TfIdf

In [None]:
# TfidfVectorizer turns text into tf-idf weighted vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Two short documents to vectorize (this rebinds `annak` to a list of strings)
annak = ['Happy families are all alike;', 'every unhappy family is unhappy in its own way']

# Learn the vocabulary and idf weights from the two documents
anna_vect = TfidfVectorizer()
anna_vect.fit(annak)

# Encode the same documents as tf-idf vectors
anna_tfidf = anna_vect.transform(annak)

# The transform result is sparse; densify it for printing
print(anna_tfidf.toarray())

"""
    [[0.4472136  0.4472136  0.4472136  0.         0.4472136  0.
      0.4472136  0.         0.         0.         0.         0.
      0.        ]
     [0.         0.         0.         0.30151134 0.         0.30151134
      0.         0.30151134 0.30151134 0.30151134 0.30151134 0.60302269
      0.30151134]]
"""

### TfIdf on Twitter airline sentiment data

In [None]:
# Tf-idf vectorizer plus scikit-learn's built-in English stop-word list
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Token pattern: two or more characters, none of them a digit or a non-word character
my_pattern = r'\b[^\d\W][^\d\W]+\b'

# Unigrams and bigrams, at most 100 features, custom tokens, English stop words removed
vect = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=100,
    token_pattern=my_pattern,
    stop_words=ENGLISH_STOP_WORDS,
)
vect.fit(tweets.text)

# Encode the tweets as tf-idf vectors
X_txt = vect.transform(tweets.text)

# Densify and label each column with its term (this rebinds `X`)
term_names = vect.get_feature_names()
X = pd.DataFrame(X_txt.toarray(), columns=term_names)
print('Top 5 rows of the DataFrame: ', X.head())


"""
    Top 5 rows of the DataFrame:     agent  airline  airport    amp  austin  ...  wait  way  website  work  yes
    0    0.0      0.0      0.0  0.000     0.0  ...   0.0  0.0      0.0   0.0  0.0
    1    0.0      0.0      0.0  0.000     0.0  ...   0.0  0.0      0.0   0.0  0.0
    2    0.0      0.0      0.0  0.000     0.0  ...   0.0  0.0      0.0   0.0  0.0
    3    0.0      0.0      0.0  0.634     0.0  ...   0.0  0.0      0.0   0.0  0.0
    4    0.0      0.0      0.0  0.000     0.0  ...   0.0  0.0      0.0   0.0  0.0
    
    [5 rows x 100 columns]
"""

### Tfidf and a BOW on same data

In [None]:
# Both vectorizer flavours, to compare on the same text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words: learn a 100-feature vocabulary, then encode the reviews as counts
vect1 = CountVectorizer(max_features=100)
vect1.fit(reviews.review)
X1 = vect1.transform(reviews.review)

# Tf-idf: same vocabulary cap, reviews encoded as tf-idf weights
vect2 = TfidfVectorizer(max_features=100)
vect2.fit(reviews.review)
X2 = vect2.transform(reviews.review)

# Densify both matrices and label the columns with the vocabulary terms
bow_terms = vect1.get_feature_names()
tfidf_terms = vect2.get_feature_names()
X_df1 = pd.DataFrame(X1.toarray(), columns=bow_terms)
X_df2 = pd.DataFrame(X2.toarray(), columns=tfidf_terms)

print('Top 5 rows, using BOW: \n', X_df1.head())
print('Top 5 rows using tfidf: \n', X_df2.head())

"""
    Top 5 rows, using BOW: 
        about  after  all  also  am  ...  with  work  would  you  your
    0      0      0    1     0   0  ...     1     0      2    0     1
    1      0      0    0     0   0  ...     0     0      1    1     0
    2      0      0    3     0   0  ...     0     1      1    2     0
    3      0      0    0     0   0  ...     0     0      0    0     0
    4      0      1    0     0   0  ...     0     0      0    3     1
    
    [5 rows x 100 columns]
    
    
    Top 5 rows using tfidf: 
        about  after    all  also   am  ...   with   work  would    you   your
    0    0.0  0.000  0.139   0.0  0.0  ...  0.113  0.000  0.307  0.000  0.175
    1    0.0  0.000  0.000   0.0  0.0  ...  0.000  0.000  0.139  0.106  0.000
    2    0.0  0.000  0.285   0.0  0.0  ...  0.000  0.139  0.105  0.160  0.000
    3    0.0  0.000  0.000   0.0  0.0  ...  0.000  0.000  0.000  0.000  0.000
    4    0.0  0.174  0.000   0.0  0.0  ...  0.000  0.000  0.000  0.328  0.163
    
    [5 rows x 100 columns]
"""

# 4. Let's Predict the Sentiment

## Let's Predict the Sentiment

### Logistic regression of movie reviews

In [None]:
# Logistic regression classifier
from sklearn.linear_model import LogisticRegression

# Target vector: the label column; feature matrix: every other column of movies
y = movies.label
X = movies.drop('label', axis=1)

# Fit the model on all rows
log_reg = LogisticRegression()
log_reg.fit(X, y)

# Accuracy is measured on the same rows the model was fitted on
training_accuracy = log_reg.score(X, y)
print('Accuracy of logistic regression: ', training_accuracy)

# Accuracy of logistic regression:  0.7852286361818425

### Logistic regression using Twitter data

In [None]:
# accuracy_score is used below; import it here so the cell does not rely on
# an import that only appears in a later cell
from sklearn.metrics import accuracy_score

# Define the vector of targets and matrix of features
y = tweets.airline_sentiment
X = tweets.drop('airline_sentiment', axis=1)

# Build a logistic regression model and calculate the accuracy
# (measured on the same rows the model was fitted on)
log_reg = LogisticRegression().fit(X, y)
print('Accuracy of logistic regression: ', log_reg.score(X, y))

# Create an array of predictions for the same rows
y_predict = log_reg.predict(X)

# Print the accuracy again, this time computed from the predictions
print('Accuracy of logistic regression: ', accuracy_score(y, y_predict))

"""
    Accuracy of logistic regression:  0.8555327868852459
    Accuracy of logistic regression:  0.8555327868852459
"""

## Did we really predict the sentiment well?

### Build and assess a model: movie reviews

In [None]:
# Classifier and the train/test splitting helper
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Target vector: the label column; feature matrix: every other column of movies
y = movies.label
X = movies.drop('label', axis=1)

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows only
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Compare accuracy on the rows seen during fitting with accuracy on the held-out rows
train_accuracy = log_reg.score(X_train, y_train)
print('Accuracy on train set: ', train_accuracy)
test_accuracy = log_reg.score(X_test, y_test)
print('Accuracy on test set: ', test_accuracy)

"""
Accuracy on train set:  0.7861666666666667
Accuracy on test set:  0.7521652231845436
"""

### Performance metrics of Twitter data

In [None]:
# accuracy_score and confusion_matrix are used below; import them here so the
# cell does not rely on an import that only appears in a later cell
from sklearn.metrics import accuracy_score, confusion_matrix

# NOTE(review): X and y are not defined in this cell; they are whatever an
# earlier cell last assigned - confirm they hold the Twitter features/targets.

# Split the data into training and testing sets (30% held out, stratified on y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

# Train a logistic regression
log_reg = LogisticRegression().fit(X_train, y_train)

# Make predictions on the test set
y_predicted = log_reg.predict(X_test)

# Print the performance metrics; the confusion matrix is divided by the number
# of test rows so each cell is a proportion of the test set
print('Accuracy score test set: ', accuracy_score(y_test, y_predicted))
print('Confusion matrix test set: \n', confusion_matrix(y_test, y_predicted)/len(y_test))

"""
Accuracy score test set:  0.8031854379977247
Confusion matrix test set: 
     [[0.57337884 0.05346985 0.00568828]
     [0.04209329 0.13879408 0.02730375]
     [0.01934016 0.04891923 0.09101251]]
"""

### Build and assess a model: product reviews data

In [None]:
# Metrics used to assess the classifier
from sklearn.metrics import accuracy_score, confusion_matrix

# NOTE(review): X and y are not defined in this cell; they are whatever an
# earlier cell last assigned - confirm they hold the product-review data.

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training rows only
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predicted labels for the held-out rows
y_predict = log_reg.predict(X_test)

# Accuracy, and the confusion matrix expressed as proportions of the test set
test_accuracy = accuracy_score(y_test, y_predict)
print('Accuracy score of test data: ', test_accuracy)
confusion_proportions = confusion_matrix(y_test, y_predict)/len(y_test)
print('Confusion matrix of test data: \n', confusion_proportions)

"""
Accuracy score of test data:  0.7853333333333333
Confusion matrix of test data: 
     [[0.39333333 0.11266667]
     [0.102      0.392     ]]
"""

## Logistic regression: revisited

### Predict probabilities of movie reviews

In [None]:
# Split into training and testing (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=321)

# Train a logistic regression
log_reg = LogisticRegression().fit(X_train, y_train)

# Score the test set once instead of once per class. The result has one
# column per class, in the order of log_reg.classes_ (column 0 is label 0
# when the labels are 0/1).
test_probabilities = log_reg.predict_proba(X_test)
# Predict the probability of the 0 class
prob_0 = test_probabilities[:, 0]
# Predict the probability of the 1 class
prob_1 = test_probabilities[:, 1]

print("First 10 predicted probabilities of class 0: ", prob_0[:10])
print("First 10 predicted probabilities of class 1: ", prob_1[:10])

"""
First 10 predicted probabilities of class 0:  [0.86210184 0.90317521 0.60800676 0.15831127 0.86473322 0.87870788
     0.61080321 0.78899465 0.4451038  0.3082362 ]
     
First 10 predicted probabilities of class 1:  [0.13789816 0.09682479 0.39199324 0.84168873 0.13526678 0.12129212
     0.38919679 0.21100535 0.5548962  0.6917638 ]
"""

### Product reviews with regularization

In [None]:
# Hold out 20% of the data for testing (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# In scikit-learn, C is the INVERSE of the regularization strength:
# C=1000 gives a weakly regularized model ...
log_reg1 = LogisticRegression(C=1000)
log_reg1.fit(X_train, y_train)
# ... while C=0.001 gives a strongly regularized one
log_reg2 = LogisticRegression(C=0.001)
log_reg2.fit(X_train, y_train)

# Compare the test-set accuracy of the two models
print('Accuracy of model 1: ', log_reg1.score(X_test, y_test))
print('Accuracy of model 2: ', log_reg2.score(X_test, y_test))

"""
    Accuracy of model 1:  0.786
    Accuracy of model 2:  0.7405
"""

### Regularizing models with Twitter data

In [None]:
# Fit two models that differ only in C, the inverse of the regularization
# strength: C=100 regularizes weakly ...
log_reg1 = LogisticRegression(C=100)
log_reg1.fit(X_train, y_train)
# ... and C=0.1 regularizes more strongly
log_reg2 = LogisticRegression(C=0.1)
log_reg2.fit(X_train, y_train)

# Predicted test-set labels from each model
y_predict1 = log_reg1.predict(X_test)
y_predict2 = log_reg2.predict(X_test)

# Accuracy of both models first, then both confusion matrices scaled by the
# test-set size so each entry is a fraction of all test samples
print('Accuracy of model 1: ', accuracy_score(y_test, y_predict1))
print('Accuracy of model 2: ', accuracy_score(y_test, y_predict2))
print('Confusion matrix of model 1: \n', confusion_matrix(y_test, y_predict1)/len(y_test))
print('Confusion matrix of model 2: \n', confusion_matrix(y_test, y_predict2)/len(y_test))


"""
Accuracy of model 1:  0.8156996587030717

Accuracy of model 2:  0.8156996587030717

Confusion matrix of model 1: 
     [[0.56484642 0.05290102 0.01023891]
     [0.02559727 0.16040956 0.02047782]
     [0.0221843  0.05290102 0.09044369]]
     
Confusion matrix of model 2: 
     [[0.58361775 0.04266212 0.00170648]
     [0.0443686  0.14675768 0.01535836]
     [0.02730375 0.05290102 0.08532423]]
"""

## Bringing it all together

### Step 1: Word cloud and feature creation

In [None]:
# Configure a word cloud with a white background, then build it from the
# text held in `positive_reviews`
cloud_positives = WordCloud(background_color='white')
cloud_positives.generate(positive_reviews)

# Draw the cloud as an image with bilinear interpolation and hide the axes
plt.imshow(cloud_positives, interpolation='bilinear')
plt.axis("off")

# Render the figure
plt.show()

In [None]:
# Tokenize each item in the review column
word_tokens = [word_tokenize(review) for review in reviews.review]

# Number of tokens in each review, in the same order as the rows of `reviews`
len_tokens = [len(tokens) for tokens in word_tokens]

# Create a new feature for the length of each review
reviews['n_words'] = len_tokens

### Step 2: Building a vectorizer

In [None]:
# Import the TfidfVectorizer and default list of English stop words
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Build the vectorizer: unigrams and bigrams, at most 200 features, and tokens
# made of two or more characters that are neither digits nor non-word
# characters.
# ENGLISH_STOP_WORDS is a frozenset; recent scikit-learn releases validate
# `stop_words` and accept only 'english', a list or None, so pass a list.
vect = TfidfVectorizer(
    stop_words=list(ENGLISH_STOP_WORDS),
    ngram_range=(1, 2),
    max_features=200,
    token_pattern=r'\b[^\d\W][^\d\W]+\b',
).fit(reviews.review)
# Create sparse matrix from the vectorizer
X = vect.transform(reviews.review)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate get_feature_names_out()
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Create a DataFrame with one column per vectorizer feature
reviews_transformed = pd.DataFrame(X.toarray(), columns=feature_names)
print('Top 5 rows of the DataFrame: \n', reviews_transformed.head())

"""
    Top 5 rows of the DataFrame: 
        able  action  actually  ago  album  ...  writing  written  wrong  year  years
    0   0.0     0.0       0.0  0.0    0.0  ...      0.0      0.0    0.0   0.0  0.000
    1   0.0     0.0       0.0  0.0    0.0  ...      0.0      0.0    0.0   0.0  0.209
    2   0.0     0.0       0.0  0.0    0.0  ...      0.0      0.0    0.0   0.0  0.152
    3   0.0     0.0       0.0  0.0    0.0  ...      0.0      0.0    0.0   0.0  0.000
    4   0.0     0.0       0.0  0.0    0.0  ...      0.0      0.0    0.0   0.0  0.000
    
    [5 rows x 200 columns]
"""

### Step 3: Building a classifier

In [None]:
# Separate the target (`score`) from the feature columns.
# NOTE(review): this expects a 'score' column in reviews_transformed, but the
# frame built from the vectorizer output in the previous step appears to hold
# only TF-IDF feature columns. Confirm where 'score' gets attached.
X = reviews_transformed.drop(columns='score')
y = reviews_transformed.score

# Hold out 20% of the rows for testing (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=456
)

# Fit the classifier on the training portion
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
# Predicted labels for the held-out rows
y_predicted = log_reg.predict(X_test)

# Accuracy, then the confusion matrix scaled by the test-set size so each
# entry is a fraction of all test samples
print('Accuracy on the test set: ', accuracy_score(y_test, y_predicted))
print(confusion_matrix(y_test, y_predicted)/len(y_test))

"""
Accuracy on the test set:  0.787
[[0.4115 0.1145]
[0.0985 0.3755]]
"""