In [1]:
import requests
from bs4 import BeautifulSoup
import pandas
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd 


names=["Sheldon", "Leonard", "Howard", "Raj", "Penny"]

#### Can we predict the character of a series from its dialogues?


In this exercise we are going to do web scraping of dialogues from the Big Bang Theory series, our goal is to train a model that learns to distinguish between the 5 main characters from the words they use.

In [2]:
url_principal="https://bigbangtrans.wordpress.com"



In [3]:
### get response
response = requests.get(url_principal)
soup = BeautifulSoup(response.text, "html.parser")
#Que obtengo?
print(response)
print(bool(response)) #Un boleano con se pudo acceder a la red 

<Response [200]>
True


## Get a list with the url of the 231 episodes.

In [4]:
chapters = []

# Suponiendo que ya tienes importado BeautifulSoup y que tienes tu objeto soup
for item in soup.find_all("div", attrs={"id":"pages-2"}):
    links = item.find_all("li", class_="page_item")
    for link in links:
        href = link.find("a")["href"]
        chapters.append(href)



The first link is an about, so we are not interested in having it.

In [5]:
chapters=chapters[1:]

In [6]:
data = {}
for i in range(len(chapters)):
    response = requests.get(chapters[i])
    soup = BeautifulSoup(response.text, "html.parser")
    dx= soup.find('div', attrs={'class':'entrytext'})
    dialogos = []
    for j in dx.find_all('p'):
        dialogos.append(j.text)
    data[f'capitulo {str(i+1)}']=dialogos




In [7]:
Sheldon={}
Leonard={}
Howard={}
Raj={}
Penny={}

for key,value in data.items():
    Sheldon[key]=[]
    Leonard[key]=[]
    Howard[key]=[]
    Raj[key]=[]
    Penny[key]=[]
    for i in value:
        if "Sheldon:" in i:
            Sheldon[key].append(i.strip("Sheldon:"))
        if "Leonard:" in i:
            Leonard[key].append(i.strip("Leonard:"))
        if "Howard:" in i:
            Howard[key].append(i.strip("Howard:"))
        if "Raj:" in i:
            Raj[key].append(i.strip("Raj:"))
        if "Penny:" in i:
            Penny[key].append(i.strip("Penny:"))
            
    Sheldon[key]=''.join(Sheldon[key])
    Leonard[key]=''.join(Leonard[key])
    Howard[key]=''.join(Howard[key])
    Raj[key]=''.join(Raj[key])
    Penny[key]=''.join(Penny[key])
        

In [8]:
df=pd.DataFrame({"Sheldon":Sheldon,"Leonard":Leonard,"Howard":Howard,"Raj":Raj,"Penny":Penny})

In [9]:
df_final=pd.melt(df,var_name="Personaje",value_name="Texto")

In [10]:
df_final.to_csv("bigbang.csv")

In [11]:
df_final

Unnamed: 0,Personaje,Texto
0,Sheldon,So if a photon is directed through a plane wi...
1,Sheldon,"Since it’s not bee season, you can have my ep..."
2,Sheldon,Good lord! Lock and load. I’ve got the Sword...
3,Sheldon,I’ve been thinking about time travel again. P...
4,Sheldon,"Alright, I’m moving my infantry division, aug..."
...,...,...
1150,Penny,"Hey, Leonard, if you’re not busy tomorrow, I ..."
1151,Penny,"Oh, wait. I remember. Yes. Please find someon..."
1152,Penny,I didn’t know you could drink while you’re br...
1153,Penny,"Champagne, champagne. And for the world’s tal..."


Our goal is to be able to transform the words into a friendly format in order to finally convert them into numerical values that represent whether that word appears in our text or not.

In [12]:
import re

# Importamos la función que nos permite Stemmizar de nltk y definimos el stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Traemos nuevamente las stopwords
stopwords = nltk.corpus.stopwords.words('english')

In [13]:
def normalizar_y_stemmizar(titular):
    # Vamos a reemplzar los caracteres que no sean letras por espacios
    titular=re.sub("[^a-zA-Z]"," ",str(titular)) # he quitado todos los símbolos que no sean letras 
    # Pasamos todo a minúsculas
    titular=titular.lower() # minúsculas
    titular = nltk.tokenize.RegexpTokenizer("[\w]+").tokenize(titular) #tokenizado
    # Eliminamos las palabras de menos de 3 letras
    titular = [palabra for palabra in titular if len(palabra)>3]
    # Sacamos las Stopwords
    titular = [word for word in titular if word not in stopwords]
    ## Hasta aqui Normalizamos, ahora a stemmizar
    
    # Aplicamos la funcion para buscar la raiz de las palabras
    titular=[stemmer.stem(palabra) for palabra in titular]
    # Por ultimo volvemos a unir el titular
    titular=" ".join(titular)
    return titular

In [14]:
df_final["texto_stem"]=df_final.Texto.apply(normalizar_y_stemmizar)

As we can see we have a stemmed text in which we have applied the function from before

In [15]:
df_final

Unnamed: 0,Personaje,Texto,texto_stem
0,Sheldon,So if a photon is directed through a plane wi...,photon direct plane slit either slit observ sl...
1,Sheldon,"Since it’s not bee season, you can have my ep...",sinc season epinephrin need chopstick thai foo...
2,Sheldon,Good lord! Lock and load. I’ve got the Sword...,good lord lock load sword azeroth sheldon swor...
3,Sheldon,I’ve been thinking about time travel again. P...,think time travel back burner anyway occur eve...
4,Sheldon,"Alright, I’m moving my infantry division, aug...",alright move infantri divis augment battalion ...
...,...,...,...
1150,Penny,"Hey, Leonard, if you’re not busy tomorrow, I ...",leonard busi tomorrow littl recept work sure c...
1151,Penny,"Oh, wait. I remember. Yes. Please find someon...",wait rememb pleas find someon care alreadi how...
1152,Penny,I didn’t know you could drink while you’re br...,know could drink breastfeed gonna anoth want f...
1153,Penny,"Champagne, champagne. And for the world’s tal...",champagn champagn world tallest second grader ...


The next step would be to vectorize in order to have the words in binary format where they represent whether they are present or not.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

max_features=1000 

cou_vec=CountVectorizer(max_features=max_features)



In [17]:
matriz_titulos = cou_vec.fit_transform(df_final["texto_stem"]) # le pasamos una columna de texto

# Tomamos las palabras
all_words = cou_vec.get_feature_names_out()

# Vizualizamos las 50 primeras palabras
print("50 primeras mas usadas: ",all_words[0:50])

50 primeras mas usadas:  ['abl' 'absolut' 'accept' 'across' 'act' 'action' 'activ' 'actress'
 'actual' 'admit' 'advic' 'afraid' 'agre' 'agreement' 'ahead' 'alcohol'
 'alien' 'allow' 'almost' 'alon' 'along' 'alreadi' 'alright' 'also'
 'although' 'alway' 'amaz' 'america' 'american' 'angri' 'anim' 'annoy'
 'anoth' 'answer' 'anybodi' 'anymor' 'anyon' 'anyth' 'anyway' 'apart'
 'apolog' 'appar' 'appear' 'appli' 'appreci' 'around' 'ask' 'assum'
 'astronaut' 'atom']


In [18]:
matriz_titulos.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 5, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
# x que sea texto_stem 

y= df_final["Personaje"]
x=matriz_titulos.toarray()


In [21]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
rf=RandomForestClassifier()
rf.fit(xtrain,ytrain)
y_pred=rf.predict(xtest)

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

In [25]:
ypred=rf.predict(xtest)
print(accuracy_score(ytest, ypred))


0.6623376623376623


By having unbalanced classes we will look at the classification report to take into account the metrics

Let's briefly define the metrics 

***Precision***: Precision refers to the ratio of correct positive predictions (true positives) to the total positive predictions made by the model (true positives plus false positives). 

***Recall***: The recall (or sensitivity) measures the proportion of true positives correctly identified by the model with respect to the total true positives in the data.


***F1-score***: The F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall and is useful when classes are unbalanced. 

In [26]:
from sklearn.metrics import classification_report


print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

      Howard       0.54      0.41      0.47        46
     Leonard       0.57      0.61      0.59        46
       Penny       0.65      0.74      0.69        46
         Raj       0.61      0.66      0.63        47
     Sheldon       0.93      0.89      0.91        46

    accuracy                           0.66       231
   macro avg       0.66      0.66      0.66       231
weighted avg       0.66      0.66      0.66       231



What do these metrics mean?

A ***precision*** of 0.92 for Sheldon, for example, means that 92% of the times the model predicted that a text belonged to Sheldon, it actually belonged to Sheldon.

A ***recall*** of 0.41 for Howard indicates that 41% of all texts that actually belonged to Howard were correctly identified by the model as belonging to Howard.

A ***F1-score*** of 0.47 for Howard indicates a combination of precision and recall performance for that particular class.

# Concluding remarks

Metrics vary by class. For example, Sheldon has a significantly higher F1-score compared to Howard. This could indicate that the model has an easier time distinguishing Sheldon from other classes.

The overall accuracy is 0.67, suggesting that the model is correctly classifying 67% of the examples in the test set. This is consistent with the accuracy score reported above.