<a href="https://colab.research.google.com/github/fmejias/CS534-ArtificialIntelligenceProject/blob/main/AI_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CS 534 - Artificial Intelligence**

## **Project Title: Text Mining and Sentiment Analysis on Twitter for predicting students dropping out during the pandemic.**

### **Students**


*   Merzia Adamjee
*   Alketa Guxha
*   Felipe Mejias
*   Nikita Boguslavskii




# **Initial configuration of the environment for the development of the project**

In [445]:
from google.colab import drive
from google.colab import files
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# **Install Textblob and Imbalanced Learn**

In [None]:
!pip install textblob
!pip install imbalanced-learn
!pip3 install flair

# **Google Authentication to read CSV File from Google Drive**

In [None]:
# Needed for Google Authentication Step
drive.mount('/content/drive')

# **Upload Dataset from Google Drive**

In [585]:
DATASET_PATH = "/content/drive/My Drive/AI_Project_CS_534/Datasets/dropping_out_tweets_part1_labeled.csv"
dataset_df = pd.read_csv(DATASET_PATH, sep=";")

# **Dataset Information**

In [None]:
# Print a summary of the Dataset
result = dataset_df.head(10)
print("First 10 rows of the DataFrame:")
print(result)

# **Dataset Preprocessing**

## **Select labeled dataset**

In [586]:
# NOTE: This selection is because they are the only label rows
labeled_dataset_df = dataset_df.head(1200)

## **Filtering irrelevant examples**

In [587]:
IRRELEVANT_KEYWORDS = ["Bernie", "Trump", "Sanders", "to become", 
                       "to pursue", "and becoming", "and going",
                       "and be", "so I can", "so i can", "to run",
                       "to spend", "to focus", "and living", "marry",
                       "stripper", "and joining", "and pursuing",
                       "bts", "BTS", "and running", "to go", "and making",
                       "to dedicate"]

def filtering_irrelevant_examples(twitter_dataset):
  """
  Filtering irrelevant tweets from the Twitter dataset.
  """
  def check_tweet_relevance(tweet):
    """
    Filtering irrelevant tweets from the Twitter dataset.
    """
    if any(indicator in tweet for indicator in IRRELEVANT_KEYWORDS):
      return "irrelevant"
    return "relevant"
  return twitter_dataset[twitter_dataset["tweet"].apply(check_tweet_relevance) \
                         != "irrelevant"]

# Filter the irrelevant tweets
labeled_dataset_df = filtering_irrelevant_examples(labeled_dataset_df)
rows, columns = labeled_dataset_df.shape
print("New number of rows: ", rows)

New number of rows:  936


## **Convert all letters to lower case**

In [588]:
def convert_letters_to_lower_case(twitter_dataset):
  """
  Convert all letters to lower case.
  """
  def tweet_to_lower_case(tweet):
    """
    Convert tweet text to lower case.
    """
    return tweet.lower()

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(tweet_to_lower_case)

# Convert all tweets to lower case
convert_letters_to_lower_case(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after lower case:")
print(result)

First 10 rows of the DataFrame after lower case:
                     id  ...                     label
0   1309219160828895233  ...      Intention of dropout
1   1308809031583236096  ...  Not intention of dropout
2   1308716552229998593  ...  Not intention of dropout
3   1308483739584835585  ...  Not intention of dropout
4   1308351921875345409  ...  Not intention of dropout
5   1307763236062650368  ...  Not intention of dropout
7   1306800980307083267  ...  Not intention of dropout
8   1305869148463992832  ...  Not intention of dropout
9   1305659343971311616  ...  Not intention of dropout
10  1305617685020053512  ...  Not intention of dropout

[10 rows x 3 columns]


## **Remove usernames that appear within a tweet**

In [589]:
def remove_usernames_from_tweets(twitter_dataset):
  """
  Remove all usernames that appear on a tweet.
  """
  def remove_username(tweet):
    """
    Remove username from tweet.
    """
    return re.sub('@[\w]+','', tweet)

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_username)

# Remove all usernames that appear in a tweet
remove_usernames_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
                     id  ...                     label
0   1309219160828895233  ...      Intention of dropout
1   1308809031583236096  ...  Not intention of dropout
2   1308716552229998593  ...  Not intention of dropout
3   1308483739584835585  ...  Not intention of dropout
4   1308351921875345409  ...  Not intention of dropout
5   1307763236062650368  ...  Not intention of dropout
7   1306800980307083267  ...  Not intention of dropout
8   1305869148463992832  ...  Not intention of dropout
9   1305659343971311616  ...  Not intention of dropout
10  1305617685020053512  ...  Not intention of dropout

[10 rows x 3 columns]


## **Remove hashtags that appear within a tweet**

In [590]:
def remove_hashtags_from_tweets(twitter_dataset):
  """
  Remove all hashtags that appear on a tweet.
  """
  def remove_hashtags(tweet):
    """
    Remove hashtags from tweet.
    """
    return tweet.replace("#", "").replace("_", " ")

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_hashtags)

# Remove all hashtags that appear in a tweet
remove_hashtags_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
                     id  ...                     label
0   1309219160828895233  ...      Intention of dropout
1   1308809031583236096  ...  Not intention of dropout
2   1308716552229998593  ...  Not intention of dropout
3   1308483739584835585  ...  Not intention of dropout
4   1308351921875345409  ...  Not intention of dropout
5   1307763236062650368  ...  Not intention of dropout
7   1306800980307083267  ...  Not intention of dropout
8   1305869148463992832  ...  Not intention of dropout
9   1305659343971311616  ...  Not intention of dropout
10  1305617685020053512  ...  Not intention of dropout

[10 rows x 3 columns]


## **Remove special characters and punctuation that appear within a tweet**

In [591]:
def remove_special_characters_and_punctuation_from_tweets(twitter_dataset):
  """
  Remove all special characters and punctuation that appear on a tweet.
  """
  def remove_special_characters_and_punctuation(tweet):
    """
    Remove special characters and punctuation from tweet.
    """
    return re.sub('[^A-Za-z0-9 ]+', '', tweet)

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_special_characters_and_punctuation)

# Remove all special characters and punctuation that appear in a tweet
remove_special_characters_and_punctuation_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
                     id  ...                     label
0   1309219160828895233  ...      Intention of dropout
1   1308809031583236096  ...  Not intention of dropout
2   1308716552229998593  ...  Not intention of dropout
3   1308483739584835585  ...  Not intention of dropout
4   1308351921875345409  ...  Not intention of dropout
5   1307763236062650368  ...  Not intention of dropout
7   1306800980307083267  ...  Not intention of dropout
8   1305869148463992832  ...  Not intention of dropout
9   1305659343971311616  ...  Not intention of dropout
10  1305617685020053512  ...  Not intention of dropout

[10 rows x 3 columns]


## **Remove URLs that appear within a tweet**

In [592]:
def remove_urls_from_tweets(twitter_dataset):
  """
  Remove all urls that appear on a tweet.
  """
  def remove_urls(tweet):
    """
    Remove urls from tweet.
    """
    return re.sub(r'http\S+', '', tweet)

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_urls)

# Remove all urls that appear in a tweet
remove_urls_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
                     id  ...                     label
0   1309219160828895233  ...      Intention of dropout
1   1308809031583236096  ...  Not intention of dropout
2   1308716552229998593  ...  Not intention of dropout
3   1308483739584835585  ...  Not intention of dropout
4   1308351921875345409  ...  Not intention of dropout
5   1307763236062650368  ...  Not intention of dropout
7   1306800980307083267  ...  Not intention of dropout
8   1305869148463992832  ...  Not intention of dropout
9   1305659343971311616  ...  Not intention of dropout
10  1305617685020053512  ...  Not intention of dropout

[10 rows x 3 columns]


## **Remove stop words that appear within a tweet**

In [593]:
from gensim.parsing.preprocessing import remove_stopwords

def remove_stop_words_from_tweets(twitter_dataset):
  """
  Remove all stop_words that appear on a tweet.
  """
  def remove_stop_words(tweet):
    """
    Remove stop_words from tweet.
    """
    return remove_stopwords(tweet)

  twitter_dataset["tweet"] = twitter_dataset["tweet"].apply(remove_stop_words)

# Remove all stop_words that appear in a tweet
remove_stop_words_from_tweets(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
                     id  ...                     label
0   1309219160828895233  ...      Intention of dropout
1   1308809031583236096  ...  Not intention of dropout
2   1308716552229998593  ...  Not intention of dropout
3   1308483739584835585  ...  Not intention of dropout
4   1308351921875345409  ...  Not intention of dropout
5   1307763236062650368  ...  Not intention of dropout
7   1306800980307083267  ...  Not intention of dropout
8   1305869148463992832  ...  Not intention of dropout
9   1305659343971311616  ...  Not intention of dropout
10  1305617685020053512  ...  Not intention of dropout

[10 rows x 3 columns]


# **Approach using Sentiment Analysis**

## **Create features using sentiment analysis and unigrams and Textblob**

In [594]:
from textblob import TextBlob

def calculate_features_using_polarity(twitter_dataset):
  """
  Use Textblob polarity to calculate the number of positive and negative words.
  """
  def calculate_positive_words(tweet):
    """
    Count number of positive words in a tweet.
    """
    number_of_positive_words = 0
    for word in tweet.split():
      if TextBlob(word).polarity > 0:
        number_of_positive_words = number_of_positive_words + 1
    return number_of_positive_words
  
  def calculate_positive_tweet_score(tweet):
    """
    Count number of positive words in a tweet.
    """
    score = 0
    for word in tweet.split():
      polarity_score = TextBlob(word).polarity
      if polarity_score > 0:
        score = score + polarity_score
    return score
  
  def calculate_ratio_positive_words(tweet):
    """
    Calculate the ratio of positive words in a tweet.
    """
    number_of_positive_words = 0
    for word in tweet.split():
      if TextBlob(word).polarity > 0:
        number_of_positive_words = number_of_positive_words + 1
    return number_of_positive_words/len(tweet.split())
  
  def calculate_negative_words(tweet):
    """
    Count number of negative words in a tweet.
    """
    number_of_negative_words = 0
    for word in tweet.split():
      if TextBlob(word).polarity < 0:
        number_of_negative_words = number_of_negative_words + 1
    return number_of_negative_words
  
  def calculate_negative_tweet_score(tweet):
    """
    Count number of negative words in a tweet.
    """
    score = 0
    for word in tweet.split():
      polarity_score = TextBlob(word).polarity
      if polarity_score < 0:
        score = score + polarity_score
    return score
  
  def calculate_ratio_negative_words(tweet):
    """
    Calculate the ratio of negative words in a tweet.
    """
    number_of_negative_words = 0
    for word in tweet.split():
      if TextBlob(word).polarity < 0:
        number_of_negative_words = number_of_negative_words + 1
    return number_of_negative_words/len(tweet.split())
  
  twitter_dataset["unigram_number_positive_words"] = twitter_dataset["tweet"].apply(calculate_positive_words)
  twitter_dataset["unigram_ratio_positive_words"] = twitter_dataset["tweet"].apply(calculate_ratio_positive_words)
  twitter_dataset["unigram_number_negative_words"] = twitter_dataset["tweet"].apply(calculate_negative_words)
  twitter_dataset["unigram_ratio_negative_words"] = twitter_dataset["tweet"].apply(calculate_ratio_negative_words)
  twitter_dataset["unigram_positive_score"] = twitter_dataset["tweet"].apply(calculate_positive_tweet_score)
  twitter_dataset["unigram_negative_score"] = twitter_dataset["tweet"].apply(calculate_negative_tweet_score)

# Calculate new features using sentiment Analysis
calculate_features_using_polarity(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
                     id  ... unigram_negative_score
0   1309219160828895233  ...                  -0.40
1   1308809031583236096  ...                   0.00
2   1308716552229998593  ...                  -0.30
3   1308483739584835585  ...                   0.00
4   1308351921875345409  ...                  -0.25
5   1307763236062650368  ...                   0.00
7   1306800980307083267  ...                   0.00
8   1305869148463992832  ...                   0.00
9   1305659343971311616  ...                   0.00
10  1305617685020053512  ...                  -0.70

[10 rows x 9 columns]


## **Create features using sentiment analysis and bigrams and Textblob**

In [595]:
from textblob import TextBlob

def find_ngrams(n, input_sequence):
  # Split sentence into tokens.
  tokens = input_sequence.split()
  ngrams = []
  for i in range(len(tokens) - n + 1):
    # Take n consecutive tokens in array.
    ngram = tokens[i:i+n]
    # Concatenate array items into string.
    ngram = ' '.join(ngram)
    ngrams.append(ngram)
  return ngrams

def calculate_bigram_features_using_polarity(twitter_dataset):
  """
  Use Textblob polarity to calculate the number of positive and negative words.
  """
  def calculate_positive_words(tweet):
    """
    Count number of positive words in a tweet.
    """
    number_of_positive_bigrams = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      if TextBlob(ngram).polarity > 0:
        number_of_positive_bigrams = number_of_positive_bigrams + 1
    return number_of_positive_bigrams
  
  def calculate_positive_tweet_score(tweet):
    """
    Count number of positive words in a tweet.
    """
    score = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      polarity_score = TextBlob(ngram).polarity
      if polarity_score > 0:
        score = score + polarity_score
    return score
  
  def calculate_ratio_positive_words(tweet):
    """
    Calculate the ratio of positive words in a tweet.
    """
    number_of_positive_bigrams = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      if TextBlob(ngram).polarity >= 0:
        number_of_positive_bigrams = number_of_positive_bigrams + 1
    return number_of_positive_bigrams/len(ngrams)
  
  def calculate_negative_words(tweet):
    """
    Count number of negative words in a tweet.
    """
    number_of_negative_bigrams = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      if TextBlob(ngram).polarity < 0:
        number_of_negative_bigrams = number_of_negative_bigrams + 1
    return number_of_negative_bigrams
  
  def calculate_negative_tweet_score(tweet):
    """
    Count number of negative words in a tweet.
    """
    score = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      polarity_score = TextBlob(ngram).polarity
      if polarity_score < 0:
        score = score + polarity_score
    return score
  
  def calculate_ratio_negative_words(tweet):
    """
    Calculate the ratio of negative words in a tweet.
    """
    number_of_negative_bigrams = 0
    ngrams = find_ngrams(2, tweet)
    for ngram in ngrams:
      if TextBlob(ngram).polarity < 0:
        number_of_negative_bigrams = number_of_negative_bigrams + 1
    return number_of_negative_bigrams/len(ngrams)
  
  twitter_dataset["bigram_number_positive_words"] = twitter_dataset["tweet"].apply(calculate_positive_words)
  twitter_dataset["bigram_ratio_positive_words"] = twitter_dataset["tweet"].apply(calculate_ratio_positive_words)
  twitter_dataset["bigram_number_negative_words"] = twitter_dataset["tweet"].apply(calculate_negative_words)
  twitter_dataset["bigram_ratio_negative_words"] = twitter_dataset["tweet"].apply(calculate_ratio_negative_words)
  twitter_dataset["bigram_positive_score"] = twitter_dataset["tweet"].apply(calculate_positive_tweet_score)
  twitter_dataset["bigram_negative_score"] = twitter_dataset["tweet"].apply(calculate_negative_tweet_score)

# Calculate new features using sentiment Analysis
calculate_bigram_features_using_polarity(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
                     id  ... bigram_negative_score
0   1309219160828895233  ...                 -0.80
1   1308809031583236096  ...                  0.00
2   1308716552229998593  ...                 -0.60
3   1308483739584835585  ...                  0.00
4   1308351921875345409  ...                 -0.50
5   1307763236062650368  ...                  0.00
7   1306800980307083267  ...                  0.00
8   1305869148463992832  ...                  0.00
9   1305659343971311616  ...                  0.00
10  1305617685020053512  ...                 -0.75

[10 rows x 15 columns]


## **Create features using sentiment analysis and unigrams and Vader**

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sentiment_analyzer = SentimentIntensityAnalyzer()

In [597]:
def calculate_features_using_polarity_vader(twitter_dataset):
  """
  Use Vader polarity to calculate the number of positive and negative words.
  """
  def calculate_positive_words(tweet):
    """
    Count number of positive words in a tweet.
    """
    number_of_positive_words = 0
    for word in tweet.split():
      if sentiment_analyzer.polarity_scores(word)["compound"] > 0:
        number_of_positive_words = number_of_positive_words + 1
    return number_of_positive_words
  
  def calculate_ratio_positive_words(tweet):
    """
    Calculate the ratio of positive words in a tweet.
    """
    number_of_positive_words = 0
    for word in tweet.split():
      if sentiment_analyzer.polarity_scores(word)["compound"] > 0:
        number_of_positive_words = number_of_positive_words + 1
    return number_of_positive_words/len(tweet.split())
  
  def calculate_negative_words(tweet):
    """
    Count number of negative words in a tweet.
    """
    number_of_negative_words = 0
    for word in tweet.split():
      if sentiment_analyzer.polarity_scores(word)["compound"] < 0:
        number_of_negative_words = number_of_negative_words + 1
    return number_of_negative_words
  
  def calculate_ratio_negative_words(tweet):
    """
    Calculate the ratio of negative words in a tweet.
    """
    number_of_negative_words = 0
    for word in tweet.split():
      if sentiment_analyzer.polarity_scores(word)["compound"] < 0:
        number_of_negative_words = number_of_negative_words + 1
    return number_of_negative_words/len(tweet.split())
  
  twitter_dataset["unigram_vader_positive_words"] = twitter_dataset["tweet"].apply(calculate_positive_words)
  twitter_dataset["unigram_vader_ratio_positive_words"] = twitter_dataset["tweet"].apply(calculate_ratio_positive_words)
  twitter_dataset["unigram_vader_negative_words"] = twitter_dataset["tweet"].apply(calculate_negative_words)
  twitter_dataset["unigram_vader_ratio_negative_words"] = twitter_dataset["tweet"].apply(calculate_ratio_negative_words)

# Calculate new features using sentiment Analysis
calculate_features_using_polarity_vader(labeled_dataset_df)

# Show results
result = labeled_dataset_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
                     id  ... unigram_vader_ratio_negative_words
0   1309219160828895233  ...                           0.111111
1   1308809031583236096  ...                           0.047619
2   1308716552229998593  ...                           0.173913
3   1308483739584835585  ...                           0.142857
4   1308351921875345409  ...                           0.181818
5   1307763236062650368  ...                           0.047619
7   1306800980307083267  ...                           0.000000
8   1305869148463992832  ...                           0.055556
9   1305659343971311616  ...                           0.000000
10  1305617685020053512  ...                           0.111111

[10 rows x 19 columns]


## **Select features to train**

In [728]:
# Select calculated features
dataset_sentiment_features = labeled_dataset_df[["unigram_number_positive_words", 
                                                 "unigram_ratio_positive_words", 
                                                 "unigram_number_negative_words", 
                                                 "unigram_ratio_negative_words",
                                                 "unigram_positive_score",
                                                 "unigram_negative_score",
                                                 "bigram_number_positive_words", 
                                                 "bigram_ratio_positive_words", 
                                                 "bigram_number_negative_words", 
                                                 "bigram_ratio_negative_words",
                                                 "bigram_positive_score",
                                                 "bigram_negative_score",
                                                 "unigram_vader_positive_words", 
                                                 "unigram_vader_ratio_positive_words", 
                                                 "unigram_vader_negative_words", 
                                                 "unigram_vader_ratio_negative_words"]]

## **Normalize calculated features using Pandas**

In [524]:
# Normalize using the mean value
normalized_df = (dataset_sentiment_features - dataset_sentiment_features.mean())/dataset_sentiment_features.std()

# Show results
result = normalized_df.head(10)
print("First 10 rows of the DataFrame after removing usernames:")
print(result)

First 10 rows of the DataFrame after removing usernames:
    unigram_number_positive_words  ...  unigram_vader_ratio_negative_words
0                        0.351245  ...                            0.592879
1                        0.351245  ...                           -0.281145
2                        2.610803  ...                            1.457402
3                       -0.778534  ...                            1.029891
4                       -0.778534  ...                            1.566223
5                       -0.778534  ...                           -0.281145
7                        0.351245  ...                           -0.936663
8                        1.481024  ...                           -0.171892
9                       -0.778534  ...                           -0.936663
10                       1.481024  ...                            0.592879

[10 rows x 16 columns]


## **Normalize calculated features using Scaler**

In [430]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Normalize using the mean value
sc = StandardScaler()
normalized_df = sc.fit_transform(dataset_sentiment_features)

## **Normalize calculated features using MinMaxScaler**

In [731]:
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler

# Normalize using the mean value
sc = MinMaxScaler()
normalized_df = sc.fit_transform(dataset_sentiment_features)

## **Select X and Y**

In [732]:
# Select the data
def select_data(normalize_data = False):
  if normalize_data:
    return normalized_df, labeled_dataset_df.label
  return dataset_sentiment_features, labeled_dataset_df.label

X, Y = select_data(normalize_data = True)

## **Handle imbalance classes using SMOTE**

In [None]:
print("Number of rows with intention of dropout: ", 
      len(labeled_dataset_df[(labeled_dataset_df['label'] == "Intention of dropout")]))
print("Number of rows with no intention of dropout: ", 
      len(labeled_dataset_df[(labeled_dataset_df['label'] == "Not intention of dropout")]))

# Transform the dataset
oversample = SMOTE()
X, Y = oversample.fit_resample(X, Y)

## **Train logistic regression model**

In [736]:
from sklearn.metrics import accuracy_score

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
logistic_classifier = LogisticRegression(random_state = 0).fit(X_train,
                                                               Y_train)

# Calculate training accuracy
Y_pred = logistic_classifier.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_pred))

# Calculate test accuracy
Y_pred = logistic_classifier.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred))

Training Accuracy:  0.6786079836233367
Testing Accuracy:  0.6816326530612244


## **Train Random Forest Classifier model**

In [738]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
random_forest_classifier = RandomForestClassifier(n_estimators=100)
random_forest_classifier.fit(X_train, Y_train)

# Calculate training accuracy
Y_pred = random_forest_classifier.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_pred))

# Calculate test accuracy
Y_pred = random_forest_classifier.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred))

Training Accuracy:  0.932446264073695
Testing Accuracy:  0.8


## **Train Adaboost Classifier model**

In [780]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
adaboost_classifier = AdaBoostClassifier(n_estimators = 100)
adaboost_classifier.fit(X_train, Y_train)

# Calculate training accuracy
Y_pred = adaboost_classifier.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_pred))

# Calculate test accuracy
Y_pred = adaboost_classifier.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred))

Training Accuracy:  0.7850562947799385
Testing Accuracy:  0.7673469387755102


## **Train Gradient Boost Classifier model**

In [787]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
gradient_boost_classifier = GradientBoostingClassifier(n_estimators = 100)
gradient_boost_classifier.fit(X_train, Y_train)

# Calculate training accuracy
Y_pred = gradient_boost_classifier.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_pred))

# Calculate test accuracy
Y_pred = gradient_boost_classifier.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred))

Training Accuracy:  0.8669396110542477
Testing Accuracy:  0.8


## **Train XG Gradient Boost Classifier model**

In [807]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
xg_gradient_boost_classifier = xgb.XGBClassifier(objective="binary:logistic", 
                                                 random_state=42)
xg_gradient_boost_classifier.fit(X_train, Y_train)

# Calculate training accuracy
Y_pred = xg_gradient_boost_classifier.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_pred))

# Calculate test accuracy
Y_pred = xg_gradient_boost_classifier.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred))

Training Accuracy:  0.8372569089048106
Testing Accuracy:  0.7306122448979592


## **Train Multinomial Naive Bayes Classifier model**

In [740]:
from sklearn.naive_bayes import MultinomialNB

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
multinomial_naive_bayes_classifier = MultinomialNB()
multinomial_naive_bayes_classifier.fit(X_train, Y_train)

# Calculate training accuracy
Y_pred = multinomial_naive_bayes_classifier.predict(X_train)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(Y_train, Y_pred))

Accuracy:  0.6212896622313203


## **Train Decision Tree Classifier model**

In [741]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train, Y_train)

# Calculate training accuracy
Y_pred = decision_tree_classifier.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_pred))

# Calculate test accuracy
Y_pred = decision_tree_classifier.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred))

Training Accuracy:  0.9314227226202662
Testing Accuracy:  0.7959183673469388


## **Train SVM model**

In [744]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
svm_classifier = SVC()
svm_classifier.fit(X_train, Y_train)

# Calculate training accuracy
Y_pred = svm_classifier.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_pred))

# Calculate test accuracy
Y_pred = svm_classifier.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred))

Training Accuracy:  0.7093142272262026
Testing Accuracy:  0.726530612244898


## **Train Stacking Ensemble model**

In [814]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Define the base learners models
base_learners = list()
base_learners.append(('bayes', GaussianNB()))
base_learners.append(('rf', RandomForestClassifier(n_estimators=100)))
base_learners.append(('cart', DecisionTreeClassifier()))
base_learners.append(('svm', SVC()))

# Define the meta learner model
meta_learner = LogisticRegression()

# Define the stacking ensemble
stacking_ensemble = StackingClassifier(estimators = base_learners, 
                                       final_estimator = meta_learner, 
                                       cv = 4)

# Train the model
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2)
stacking_ensemble.fit(X_train, Y_train)

# Calculate training accuracy
Y_pred = stacking_ensemble.predict(X_train)
print("Training Accuracy: ", accuracy_score(Y_train, Y_pred))

# Calculate test accuracy
Y_pred = stacking_ensemble.predict(X_test)
print("Testing Accuracy: ", accuracy_score(Y_test, Y_pred))

Training Accuracy:  0.9293756397134084
Testing Accuracy:  0.7836734693877551
