# Model & Evaluation

## 1. Load Packages and Datasets

In [117]:
import json
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [118]:
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [119]:
# Load pre-processed train and test set from EDA notebook
train_df = pd.read_json('./../data/preprocess_train.json')
test_df = pd.read_json('./../data/preprocess_test.json')

raw_train_df = pd.read_json('./../data/train.json')
raw_test_df = pd.read_json('./../data/test.json')

## 2. Model Building

### 2.1 Topic Model

In [77]:
# Sentiment analysis
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Define a function to perform sentiment analysis on a single piece of text
def analyze_sentiment(text):
    # Get sentiment scores
    scores = sid.polarity_scores(text)

    # Classify sentiment
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis to your text data
train_df['sentiment'] = train_df['full_text'].apply(analyze_sentiment)
test_df['sentiment'] = test_df['full_text'].apply(analyze_sentiment)

# Print some sample results
print("Sample results from training data:")
print(train_df[['full_text', 'sentiment']].head())

print("\nSample results from test data:")
print(test_df[['full_text', 'sentiment']].head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jvo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sample results from training data:
                                           full_text sentiment
0  Design Thinking for innovation reflexion-Avril...  positive
1  Diego Estrada\n\nDesign Thinking Assignment\n\...  positive
2  Reporting process\n\nby Gilberto Gamboa\n\nCha...  positive
3  Design Thinking for Innovation\n\nSindy Samaca...  positive
4  Assignment:  Visualization Reflection  Submitt...  positive

Sample results from test data:
                                           full_text sentiment
0  Design Thinking for innovation reflexion-Avril...  positive
1  Diego Estrada\n\nDesign Thinking Assignment\n\...  positive
2  Reporting process\n\nby Gilberto Gamboa\n\nCha...  positive
3  Design Thinking for Innovation\n\nSindy Samaca...  positive
4  Assignment:  Visualization Reflection  Submitt...  positive


In [78]:
# NMF Model
corpus_train = train_df['tokens_processed'].apply(lambda tokens: ' '.join(tokens))
corpus_test = test_df['tokens_processed'].apply(lambda tokens: ' '.join(tokens))
corpus = pd.concat([corpus_train, corpus_test], ignore_index=True)

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_X = tfidf_vectorizer.fit_transform(corpus)

# Count Vectorizer
count_vectorizer = CountVectorizer(max_features=1000)
count_X = count_vectorizer.fit_transform(corpus)

# Initialize and fit NMF model
num_topics = 5
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf_X)

def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1]
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print(" %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))


display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())




Topic 00
 launch (2.04)
 learning (1.69)
 team (1.22)
 product (0.97)
 customers (0.82)

Topic 01
 storytelling (1.88)
 story (1.86)
 people (1.17)
 stories (1.12)
 audience (0.60)

Topic 02
 mind (3.08)
 mapping (2.43)
 ideas (1.18)
 map (1.17)
 tool (0.86)

Topic 03
 graphic (2.91)
 visualization (1.92)
 group (1.77)
 problem (1.66)
 straw (1.35)

Topic 04
 students (9.79)
 student (2.16)
 school (2.15)
 teachers (1.55)
 class (1.37)


In [79]:
#LSA Model
lsa_model = TruncatedSVD(n_components=num_topics, random_state=42)
lsa_model.fit(tfidf_X)

# Display topics
display_topics(lsa_model, tfidf_vectorizer.get_feature_names_out())


Topic 00
 tool (0.64)
 team (0.59)
 mind (0.59)
 learning (0.50)
 design (0.50)

Topic 01
 story (42.47)
 storytelling (41.01)
 stories (27.16)
 people (18.62)
 telling (12.54)

Topic 02
 mind (85.48)
 mapping (66.68)
 map (33.79)
 graphic (26.18)
 ideas (25.05)

Topic 03
 graphic (18.38)
 visualization (9.30)
 group (8.78)
 straw (8.70)
 man (7.74)

Topic 04
 students (56.99)
 learning (19.50)
 launch (14.99)
 school (12.87)
 student (12.75)


In [80]:
# LDA Model
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(count_X)

# Display topics
display_topics(lda_model, count_vectorizer.get_feature_names_out())


Topic 00
 storytelling (1.91)
 story (1.83)
 people (1.80)
 tool (1.07)
 one (0.94)

Topic 01
 visualization (2.11)
 process (1.60)
 tool (1.38)
 team (1.35)
 would (1.04)

Topic 02
 mind (4.53)
 mapping (3.24)
 ideas (2.01)
 tool (1.89)
 design (1.82)

Topic 03
 group (3.27)
 problem (2.96)
 graphic (2.61)
 insights (2.05)
 identify (1.74)

Topic 04
 learning (2.32)
 launch (2.17)
 team (1.66)
 product (1.35)
 customers (1.24)


### 2.2 Random Forest Model

In [115]:
# Convert list of tokens back to strings
train_df['tokens_joined'] = train_df['tokens_processed'].apply(lambda tokens: ' '.join(tokens))

# Extract features and labels
X = train_df['tokens_joined']
y = train_df['labels_processed']

mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform(y)

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X_tfidf, y_bin, test_size=0.2, random_state=599)

In [116]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=599)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = rf_classifier.predict(X_valid)

# Evaluate the model
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00        12
           2       1.00      0.01      0.01       183
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00       169
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00      1362

   micro avg       1.00      0.78      0.88      1744
   macro avg       0.15      0.08      0.08      1744
weighted avg       0.89      0.78      0.78      1744
 samples avg       1.00      0.91      0.93      1744



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
