<a href="https://colab.research.google.com/github/fmejias/CS534-ArtificialIntelligenceProject/blob/main/CS534-ArtificialIntelligenceProject/AI_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CS 534 - Artificial Intelligence**

## **Project Title: Text Mining and Sentiment Analysis on Twitter for predicting students dropping out during the pandemic.**

### **Students**


*   Merzia Adamjee
*   Alketa Guxha
*   Felipe Mejias
*   Nikita Boguslavskii




# **Initial configuration of the environment for the development of the project**

In [76]:
from google.colab import drive
from google.colab import files
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# **Install Textblob and Imbalanced Learn**

In [None]:
!pip install textblob
!pip install imbalanced-learn

# **Google Authentication to read CSV File from Google Drive**

In [2]:
# Needed for Google Authentication Step
drive.mount('/content/drive')

Mounted at /content/drive


# **Upload Dataset from Google Drive**

In [6]:
DATASET_PATH = "/content/drive/My Drive/AI_Project_CS_534/Datasets/dropping_out_tweets_part1_labeled.csv"
dataset_df = pd.read_csv(DATASET_PATH, sep=";")

# **Dataset Information**

In [None]:
# Print a summary of the Dataset
result = dataset_df.head(10)
print("First 10 rows of the DataFrame:")
print(result)

# **Dataset Preprocessing**

## **Select labeled dataset**

In [45]:
# NOTE: This selection is because they are the only label rows
labeled_dataset_df = dataset_df.head(1200)

## **Filtering irrelevant examples**

In [46]:
IRRELEVANT_KEYWORDS = ["Bernie", "Trump", "Sanders", "to become", 
                       "to pursue", "and becoming", "and going",
                       "and be", "so I can", "so i can", "to run",
                       "to spend", "to focus", "and living", "marry",
                       "stripper", "and joining", "and pursuing",
                       "bts", "BTS", "and running", "to go", "and making",
                       "to dedicate"]

def filtering_irrelevant_examples(twitter_dataset):
  """
  Filtering irrelevant tweets from the Twitter dataset.
  """
  def check_tweet_relevance(tweet):
    """
    Filtering irrelevant tweets from the Twitter dataset.
    """
    if any(indicator in tweet for indicator in IRRELEVANT_KEYWORDS):
      return "irrelevant"
    return "relevant"
  return twitter_dataset[twitter_dataset["tweet"].apply(check_tweet_relevance) \
                         != "irrelevant"]

# Filter the irrelevant tweets
labeled_dataset_df = filtering_irrelevant_examples(labeled_dataset_df)
rows, columns = labeled_dataset_df.shape
print("New number of rows: ", rows)

New number of rows:  936


## **Convert all letters to lower case**

In [None]:
def convert_letters_to_lower_case(twitter_dataset):
  """
  Convert all letters to lower case.
  """
  def tweet_to_lower_case(tweet):
    """
    Convert tweet text to lower case.
    """
    return tweet.lower()

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(tweet_to_lower_case)

# Convert all tweets to lower case
convert_letters_to_lower_case(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after lower case:")
print(result)

## **Remove usernames that appear within a tweet**

In [None]:
def remove_usernames_from_tweets(twitter_dataset):
  """
  Remove all usernames that appear on a tweet.
  """
  def remove_username(tweet):
    """
    Remove username from tweet.
    """
    return re.sub('@[\w]+','', tweet)

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_username)

# Remove all usernames that appear in a tweet
remove_usernames_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

## **Remove hashtags that appear within a tweet**

In [None]:
def remove_hashtags_from_tweets(twitter_dataset):
  """
  Remove all hashtags that appear on a tweet.
  """
  def remove_hashtags(tweet):
    """
    Remove hashtags from tweet.
    """
    return tweet.replace("#", "").replace("_", " ")

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_hashtags)

# Remove all hashtags that appear in a tweet
remove_hashtags_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

## **Remove special characters and punctuation that appear within a tweet**

In [None]:
def remove_special_characters_and_punctuation_from_tweets(twitter_dataset):
  """
  Remove all special characters and punctuation that appear on a tweet.
  """
  def remove_special_characters_and_punctuation(tweet):
    """
    Remove special characters and punctuation from tweet.
    """
    return re.sub('[^A-Za-z0-9 ]+', '', tweet)

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_special_characters_and_punctuation)

# Remove all special characters and punctuation that appear in a tweet
remove_special_characters_and_punctuation_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

## **Remove URLs that appear within a tweet**

In [None]:
def remove_urls_from_tweets(twitter_dataset):
  """
  Remove all urls that appear on a tweet.
  """
  def remove_urls(tweet):
    """
    Remove urls from tweet.
    """
    return re.sub(r'http\S+', '', tweet)

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_urls)

# Remove all urls that appear in a tweet
remove_urls_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

# **Approach using Sentiment Analysis**

## **Create features using sentiment analysis and unigrams**

In [None]:
from textblob import TextBlob

def calculate_features_using_polarity(twitter_dataset):
  """
  Use Textblob polarity to calculate the number of positive and negative words.
  """
  def calculate_positive_words(tweet):
    """
    Count number of positive words in a tweet.
    """
    number_of_positive_words = 0
    for word in tweet.split():
      if TextBlob(word).polarity >= 0:
        number_of_positive_words = number_of_positive_words + 1
    return number_of_positive_words
  
  def calculate_ratio_positive_words(tweet):
    """
    Calculate the ratio of positive words in a tweet.
    """
    number_of_positive_words = 0
    for word in tweet.split():
      if TextBlob(word).polarity >= 0:
        number_of_positive_words = number_of_positive_words + 1
    return number_of_positive_words/len(tweet.split())
  
  def calculate_negative_words(tweet):
    """
    Count number of negative words in a tweet.
    """
    number_of_negative_words = 0
    for word in tweet.split():
      if TextBlob(word).polarity < 0:
        number_of_negative_words = number_of_negative_words + 1
    return number_of_negative_words
  
  def calculate_ratio_negative_words(tweet):
    """
    Calculate the ratio of negative words in a tweet.
    """
    number_of_negative_words = 0
    for word in tweet.split():
      if TextBlob(word).polarity < 0:
        number_of_negative_words = number_of_negative_words + 1
    return number_of_negative_words/len(tweet.split())
  
  twitter_dataset["positive_words"] = twitter_dataset["tweet"].apply(calculate_positive_words)
  twitter_dataset["ratio_positive_words"] = twitter_dataset["tweet"].apply(calculate_ratio_positive_words)
  twitter_dataset["negative_words"] = twitter_dataset["tweet"].apply(calculate_negative_words)
  twitter_dataset["ratio_negative_words"] = twitter_dataset["tweet"].apply(calculate_ratio_negative_words)

# Calculate new features using sentiment Analysis
calculate_features_using_polarity(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

## **Normalize calculated features**

In [None]:
# Select calculated features
dataset_sentiment_features = labeled_dataset_df[["positive_words", "ratio_positive_words", 
                                                 "negative_words", "ratio_negative_words"]]

# Normalize using the mean value
normalized_df = (dataset_sentiment_features - dataset_sentiment_features.mean())/dataset_sentiment_features.std()

# Show results
result = normalized_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

## **Handle imbalance classes using SMOTE**

In [None]:
print("Number of rows with intention of dropout: ", len(labeled_dataset_df[(labeled_dataset_df['label'] == "Intention of dropout")]))
print("Number of rows with no intention of dropout: ", len(labeled_dataset_df[(labeled_dataset_df['label'] == "Not intention of dropout")]))

# Select the data
X = normalized_df
Y = labeled_dataset_df[["label"]]

# Transform the dataset
oversample = SMOTE()
X, Y = oversample.fit_resample(X, np.ravel(Y))

print("Number of rows with intention of dropout: ", len(X))
print("Number of rows with no intention of dropout: ", len(Y))

## **Train logistic regression model**

In [87]:
# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
logistic_classifier = LogisticRegression(random_state = 0).fit(X_train,
                                                               Y_train)

# Calculate predictions
Y_pred = logistic_classifier.predict(X_test)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(Y_test, Y_pred))

Accuracy:  0.7387755102040816


## **Create features using sentiment analysis and bigrams**

In [None]:
from textblob import TextBlob

def find_ngrams(n, input_sequence):
  # Split sentence into tokens.
  tokens = input_sequence.split()
  ngrams = []
  for i in range(len(tokens) - n + 1):
    # Take n consecutive tokens in array.
    ngram = tokens[i:i+n]
    # Concatenate array items into string.
    ngram = ' '.join(ngram)
    ngrams.append(ngram)
  return ngrams

def calculate_bigram_features_using_polarity(twitter_dataset):
  """
  Use Textblob polarity to calculate the number of positive and negative words.
  """
  def calculate_positive_words(tweet):
    """
    Count number of positive words in a tweet.
    """
    number_of_positive_bigrams = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      if TextBlob(ngram).polarity >= 0:
        number_of_positive_bigrams = number_of_positive_bigrams + 1
    return number_of_positive_bigrams
  
  def calculate_ratio_positive_words(tweet):
    """
    Calculate the ratio of positive words in a tweet.
    """
    number_of_positive_bigrams = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      if TextBlob(ngram).polarity >= 0:
        number_of_positive_bigrams = number_of_positive_bigrams + 1
    return number_of_positive_bigrams/len(ngrams)
  
  def calculate_negative_words(tweet):
    """
    Count number of negative words in a tweet.
    """
    number_of_negative_bigrams = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      if TextBlob(ngram).polarity < 0:
        number_of_negative_bigrams = number_of_negative_bigrams + 1
    return number_of_negative_bigrams
  
  def calculate_ratio_negative_words(tweet):
    """
    Calculate the ratio of negative words in a tweet.
    """
    number_of_negative_bigrams = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      if TextBlob(ngram).polarity < 0:
        number_of_negative_bigrams = number_of_negative_bigrams + 1
    return number_of_negative_bigrams/len(ngrams)
  
  twitter_dataset["bigram_positive_words"] = twitter_dataset["tweet"].apply(calculate_positive_words)
  twitter_dataset["bigram_ratio_positive_words"] = twitter_dataset["tweet"].apply(calculate_ratio_positive_words)
  twitter_dataset["bigram_negative_words"] = twitter_dataset["tweet"].apply(calculate_negative_words)
  twitter_dataset["bigram_ratio_negative_words"] = twitter_dataset["tweet"].apply(calculate_ratio_negative_words)

# Calculate new features using sentiment Analysis
calculate_bigram_features_using_polarity(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

## **Normalize calculated bigram features**

In [None]:
# Select calculated features
dataset_sentiment_features = labeled_dataset_df[["bigram_positive_words", "bigram_ratio_positive_words",
                                                 "bigram_negative_words", "bigram_ratio_negative_words"]]

# Normalize using the mean value
normalized_df = (dataset_sentiment_features - dataset_sentiment_features.mean())/dataset_sentiment_features.std()

# Show results
result = normalized_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

## **Train logistic regression model**

In [None]:
# Select the data
X = normalized_df
Y = labeled_dataset_df[["label"]]

# Transform the dataset
oversample = SMOTE()
X, Y = oversample.fit_resample(X, np.ravel(Y))

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
logistic_classifier = LogisticRegression(random_state = 0).fit(X_train,
                                                               Y_train)

# Calculate predictions
Y_pred = logistic_classifier.predict(X_test)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(Y_test, Y_pred))