In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Getting Familiar with GenAI Applications in Tech: Notebook 1

| | | |
|-|-|-|


## Binary Classification the ML Way

Let's start by looking at how we would set up a binary classification model with machine learning, using the comments that you provided for our fictional weatherbot.


In [1]:
#@title Training a sentiment classification model with sklearn

import pandas as pd

# Step 1: Get training data and look at it
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

sentiment_140_df = pd.read_csv("data/training.1600000.processed.noemoticon.csv",
                names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                encoding='latin-1')

# "positive" sentiment is labeled as "4", which is a little odd; let's rename it
# to "1"
sentiment_140_df.polarity = sentiment_140_df.polarity.replace({0: 0, 4: 1})

sentiment_140_df.sample(3)

--2024-07-19 10:49:37--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85088192 (81M) [application/zip]
Saving to: ‘data/training.1600000.processed.noemoticon.csv.zip’


2024-07-19 10:49:40 (29.8 MB/s) - ‘data/training.1600000.processed.noemoticon.csv.zip’ saved [85088192/85088192]

Archive:  data/training.1600000.processed.noemoticon.csv.zip
  inflating: data/training.1600000.processed.noemoticon.csv  


Unnamed: 0,polarity,id,date,query,user,text
557235,0,2204464895,Wed Jun 17 02:17:12 PDT 2009,NO_QUERY,ginkgink,Still not so many works to do today
931598,1,1770867341,Mon May 11 22:28:33 PDT 2009,NO_QUERY,treewatcher21,@cre8tvdirektr wow. this is cool
523658,0,2193205157,Tue Jun 16 08:08:22 PDT 2009,NO_QUERY,ciaara,i slept like 4 hours last night i need more s...


In [3]:
# Step 2: Turn each tweet into a TF-IDF vector, i.e. a long, mostly-zero list of
# numbers that weights every word by how often it occurs in the tweet and how
# rare it is across all tweets, then train some sentiment classifiers

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Learn the TF-IDF vocabulary from every tweet and vectorise the whole corpus in
# one pass; the (already recoded) polarity column is the training target.
vectorizer = TfidfVectorizer()
y = sentiment_140_df['polarity']
X_tf = vectorizer.fit_transform(sentiment_140_df['text'])

# Classifiers to train, keyed by the display name used for them later on.
#
# Logistic regression is deliberately left out. Per the original author's note
# it is super, super slow to fit on this data; the model will eventually
# converge, but with very poor accuracy. To try it anyway, add
#   "Logistic Regression": LogisticRegression(max_iter=10000)
# to the dict below.
estimators = {
    "Naive Bayes": MultinomialNB(),
    # One hidden layer of 10 units, trained with the lbfgs solver; the fixed
    # random_state keeps the weight initialisation repeatable.
    "DNN": MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10,), random_state=42),
}

# Fit one estimator at a time and store each as soon as it is done, so that
# `models` already holds the finished ones if a later (slow) fit is interrupted.
models = {}
for estimator_name, estimator in estimators.items():
  models[estimator_name] = estimator.fit(X_tf, y)

time: 12min 43s (started: 2024-07-19 11:06:45 +00:00)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [4]:
# Step 3: Define an object that contains your comments from the homework, then
# use the models to predict the comments' sentiments and compare that to the
# sentiment that you provided

import io

# The homework comments, kept inline as CSV text with two columns: the free-text
# "Comment" and its hand-assigned "Sentiment" ("positive" or "negative").
# Comments that contain an ASCII comma are wrapped in double quotes. The blank
# first line of the literal is skipped by pandas, so header=0 picks up the
# "Comment,Sentiment" line.
your_comments_df = pd.read_csv(io.StringIO('''
Comment,Sentiment
"i asked the bot if it would rain at 5pm and it said no so i went out without my umbrella and then it rained and i got completely soaked, would not recommend",negative
"You can ask it about the current or future temperature, UV index, or precipitation, and it will give you information with references so you can check the answers. Pretty cool",positive
love it,positive
stupid,negative
"The bot doesn’t just give you bland information like a weather report would do, but it makes you feel like you are actually in a conversation with someone.",positive
這個氣象APP不夠準確，他說今天不會下雨，但是下午卻下了大雷雨,negative
Works fine but could use improvements. I liked that the weather report was pretty accurate but it couldn’t answer my question about what to wear.,negative
"It turned out to be accurate, but I felt I would not rely on it; I would still check another weather source.",negative
Worked great until the most recent update!!!!1!,negative
This is the only weather bot I will use. Here are some suggestions to the developers for further improvements: add a feature where it outputs a summary based on zip code; allow syncing across multiple devices; include an option for dark mode on iPhone.,positive
"The bot said I didn’t need sunscreen today, so I didn’t apply any. I’m now taking an aloe bath to try and save my skin. Horrible app!",negative
"The bot has predicted rain for the past month. It hasn’t been wrong once, but I’m still pretty unhappy about it!",negative
Everyday is 95 with a chance of afternoon thunderstorms. Duh. This doesn’t tell me anything I didn’t already know.,negative
"Thunderstorms are happening in my area, and I wish I could know how close the lightning is, but it didn’t tell me that.",negative
"Everyday said 95 with a chance of afternoon thunderstorms, and it’s been right! So cool that this is a bot!",positive
It didn’t answer my question.,negative
It gave me a bunch of irrelevant information.,negative
I’d rather talk to a person.,negative
I asked the bot if it would be raining cats and dogs today and it said no. Imagine my surprise when I stepped outside into the pouring rain without my umbrella - I can’t tell if it took my question literally!,negative
I appreciate the bot’s accuracy and will continue using it regularly.,positive
meh,negative
idare eder,positive
I don't know what I would do without it!,positive
it ain't no human,negative
toomuch info,negative
The app keeps showing me ads and wants me to pay! Free app! Not!,negative
Hajur 这个app真是太棒了,positive
Why cant people just go outside and see the weather themselves,negative
este bot es muy divertido y funciona bien la vdd es k me gustó muchichichisimo y lo llamé bob ;),positive
It keeps saying it doesn’t understand. It’s annoying.,negative
Very limited number of responses. There are much better bots out there.,negative
برای سوال‌های ساده و سریع کار رو راه می‌اندازه.,positive
А поумнее б0та вы не могли придумать?,negative
"Нра юзать эту приложуху, разрабы - красавы!",positive
このアプリ最高！毎日使ってる,positive
האפליקציה צריכה עדכון כל שבווע!! סיוט,negative
It was raining outside and the bot still said it was sunny!,negative
Wegen der Scheißapp bin ich jetzt klatschnass!!1! Sofort deinstalliert,negative
"Einfach zu bedienen, klare Informationen, klasse Service!",positive
"Ich bin begeistert, wie höflich die Macher:innen der Applikation mit mir kommunizieren. Die Informationen könnten etwas aktueller sein, aber Höflichkeit schlägt alles, also von mir eine Empfehlung.",positive
"Der wetter-bot iz an oysgezeikhnter hilfsmittel! Er iz zeyer batamter, kenen identifitsiren intents fun forshungen, un derklernt zeyer azoy good. Oyb ir hot a frage vegn dem klimat, temperatur, oder regn, der wetter-bot vet zikh freylekh mitn antshuldikn. A poshet a mekhaye!",positive
"Der wetter-bot iz geven zeyer umhöflek un beleydikend. Er hot nit geantvort af mayne forshungen vi er volt gedarft, un in an ort fun helfn, hot er geshribn beleydikungen. Dos iz nit farvaylendik un zeyer umprofesyonel. Ikh bin zeyer antoysht un vel nit mer banutzn dem bot.",negative
Den katalava tipota,negative
"Καλό app, πολύ χρήσιμο για να ξέρω αν πρέπει να πάρω ομπρέλα το πρώι",positive
"MIN PISTEPSTE TO APP; TO EVGALAN MONO GIA NA MAS MPERDEPSOUN KAI NA MIN KATALAVOUME PWS MAS PSEKAZOUN EDW KAI XRONIA!!!! KANTE TIN EREYNA SAS, KSYPNISTE RE PROVATA!!!!! DEITE TIN ISTOSELIDA MOU GIA PARA PANW PLIRIFORIES!",negative
Bot em i no save long Tok Pisin :(,negative
I was actually looking for an app that was more like helping me to deal with my money issues. This is really bad at that,negative
Ich versteh die ganzen negativen Reviews nicht. Die meisten von den Leuten waren wahrscheinlich einfach zu doof für die Bedienung.,positive
"Yesterday morning I got up and thought to myself I need to buy some bread later today. I didn’t have much time in the morning because I needed to fix the plumbing in the bathroom, but then at noon I already had an appointment to get my eyes tested. Not really sure, what type of frames I want to get for my new glasses. I kind of like the square ones, but my sister told me to get the large round ones. Anyway, I got my bread on the way back home, was stopping by at the supermarket that’s right on the way. Was really hungry by that time, so I had two slices with some spread I picked up the other week at the farmer’s market. Of course, it’s important to stay hydrated as well, so I had a glass of water. And because I was still thirsty, I had another one. You know, I like drinking tap water, the tap water we have here is really good. Anyway, I was scrolling through the Playstore as I was sitting and drinking and that’s how I found the app.",positive
My pohne is getting super hot bruned rihgt through my pocket,negative
My cat is a meteorology genius compared to this app.,negative
Asked if I needed to take an umbrella with me today to work. It said no but it rained,negative
Un po’ scarso in italiano ma le previsioni sono accurate,positive
頼ったら痛い目にあった おかげさまでずぶ濡れアプリ出すんだったらちゃんとしたのを出せ,negative
抱着随便试试的心态用了这个app，没想到比黄大仙还准啊。早上 出门还是大晴天，刚过中午马上开始暴雨，和预报的一模一样。,positive
omg wow!!!!!!!!! another amazing update with fantastic new features that people DEFINITELY wanted!!!!!!!! I love how lsow it is now. /s,negative
$12 a month??????????????? in this economy????????????????? girl.,negative
charged me for sub even after I canceled. customer support no help,negative
omg such a cute lil bot. I would die for her.,positive
It’s a bot. It tells you the weather. It’s okay.,positive
'''), header=0)


# Encode the text labels as integers: positive -> 1, negative -> 0.
# An explicit .map() is used instead of .replace() so the string-to-int
# conversion does not depend on pandas implicitly downcasting the column, and so
# that a mistyped label shows up as a missing value instead of silently staying
# a string (which the metric code further down would count as a true negative).
label_to_int = {"positive": 1, "negative": 0}
encoded_sentiment = your_comments_df["Sentiment"].map(label_to_int)

# Fail loudly, naming the offending values, if any label is not recognised.
unknown_labels = your_comments_df.loc[encoded_sentiment.isna(), "Sentiment"].unique()
if len(unknown_labels) > 0:
  raise ValueError(
      f"Unexpected Sentiment label(s): {list(unknown_labels)}; "
      "expected 'positive' or 'negative'")

your_comments_df["Sentiment"] = encoded_sentiment.astype("int64")

def confusion_counts(expected, predicted):
  """Tally the binary confusion matrix for paired 0/1 labels.

  Args:
    expected: iterable of ground-truth labels (1 = positive, 0 = negative).
    predicted: iterable of predicted labels, paired positionally with
      `expected`; pairing stops at the end of the shorter iterable.

  Returns:
    A `(tp, fp, fn, tn)` tuple of ints. Any pair that is not a true positive,
    false negative or false positive is counted as a true negative.
  """
  tp = fp = fn = tn = 0
  for gt, prediction in zip(expected, predicted):
    if gt == 1 and prediction == 1:
      tp += 1
    elif gt == 1 and prediction == 0:
      fn += 1
    elif gt == 0 and prediction == 1:
      fp += 1
    else:
      tn += 1
  return tp, fp, fn, tn


def precision_and_recall(tp, fp, fn):
  """Compute precision and recall from confusion-matrix counts.

  Args:
    tp: number of true positives.
    fp: number of false positives.
    fn: number of false negatives.

  Returns:
    A `(precision, recall)` tuple. A metric whose denominator is zero (no
    predicted positives, or no actual positives) is reported as 0.
  """
  precision = tp / (tp + fp) if (tp + fp) > 0 else 0
  recall = tp / (tp + fn) if (tp + fn) > 0 else 0
  return precision, recall


# turn the comments into TF-IDF vectors with the vectorizer that was already
# fitted on the training tweets
new_xs = vectorizer.transform(your_comments_df["Comment"])

# for each model, predict a sentiment for each comment, then add a new column
# that stores that prediction. The models were trained on 0/1 polarity labels,
# so their predictions are already 0/1 and are stored as they are (the earlier
# `(prediction + 1) // 2` remapping left 0/1 values unchanged).
for model_name, model in models.items():
  predicted_sentiment = model.predict(new_xs)
  your_comments_df["predicted_sentiment_per_" + model_name] = predicted_sentiment

# to facilitate processing, break these all out into lists
ground_truth = your_comments_df["Sentiment"].tolist()
predicted_naive_bayes = your_comments_df["predicted_sentiment_per_Naive Bayes"].tolist()
predicted_dnn = your_comments_df["predicted_sentiment_per_DNN"].tolist()

# calculate P/R for Naive Bayes
tp_naive_bayes, fp_naive_bayes, fn_naive_bayes, tn_naive_bayes = confusion_counts(
    ground_truth, predicted_naive_bayes)
precision_naive_bayes, recall_naive_bayes = precision_and_recall(
    tp_naive_bayes, fp_naive_bayes, fn_naive_bayes)

# calculate P/R for DNN
tp_dnn, fp_dnn, fn_dnn, tn_dnn = confusion_counts(ground_truth, predicted_dnn)
precision_dnn, recall_dnn = precision_and_recall(tp_dnn, fp_dnn, fn_dnn)

# print all results
print("Naive Bayes:")
print(f"Precision: {precision_naive_bayes:.4f}")
print(f"Recall: {recall_naive_bayes:.4f}")
print("-" * 30)

print("DNN:")
print(f"Precision: {precision_dnn:.4f}")
print(f"Recall: {recall_dnn:.4f}")



Naive Bayes:
Precision: 0.4706
Recall: 0.3478
------------------------------
DNN:
Precision: 0.5517
Recall: 0.6957
time: 74.4 ms (started: 2024-07-19 11:19:50 +00:00)
