In [None]:
import pandas as pd
from gensim import models
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

!unzip archive.zip

In [None]:
df = pd.read_csv('simpsons_script_lines.csv',dtype = 'unicode')

In [None]:
df.head(1)

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464,3,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31


In [None]:
sentences =  [x.split(' ') for x in df['normalized_text'].fillna('').tolist() ]

In [None]:
model = models.Word2Vec()
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(4830340, 6661320)

In [None]:
most_frequent_words = model.wv.index_to_key[:1000]

In [None]:
most_frequent_words[:10]

['the', 'you', 'i', 'a', 'to', '', 'and', 'of', 'it', 'that']

In [None]:
embeddings = [model.wv[x] for x in most_frequent_words]

In [None]:
 model.wv.most_similar('the',topn=10)

[('springfield', 0.6006653308868408),
 ('america', 0.5962285995483398),
 ('world', 0.5670474171638489),
 ('every', 0.5252284407615662),
 ('which', 0.5117892026901245),
 ('our', 0.5092774033546448),
 ('top', 0.5027733445167542),
 ('peace', 0.5016379356384277),
 ('east', 0.5014795064926147),
 ('state', 0.49849972128868103)]

In [None]:
embedding_clusters = []
word_clusters = []
for word in most_frequent_words:
    embeddings = []
    words = []
    for similar_word, _ in model.wv.most_similar(word, topn=5):
        words.append(similar_word)
        embeddings.append(model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

In [None]:

tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=500, random_state=32)
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

In [None]:
embeddings_en_2d [0]

array([[-39.52971  ,   7.4671264],
       [-36.03111  ,   9.167298 ],
       [-35.74734  ,  29.651346 ],
       [-39.994812 ,  -5.2254033],
       [  3.3246064,  33.024036 ]], dtype=float32)

In [None]:
import plotly.express as px
import plotly.graph_objs as go

In [None]:
def tsne_plot_similar_words(labels, embedding_clusters, word_clusters, a=0.7):
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:,0]
        y = embeddings[:,1]
        plt.scatter(x, y, c=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)

    plt.grid(True)
    plt.show()


In [None]:
def plot(labels, embedding_clusters, word_clusters):
    colors =list(range(len(labels)))
    color_scale = px.colors.sequential.Plasma
    # Create a scatter plot
    fig = go.Figure()
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        fig.add_trace(go.Scatter(
            x=x,
            y=y,
            mode='markers',
            marker=dict(
                size=8,
                color=color,
                colorscale=color_scale,
                opacity=0.5,
            ),
            text=words,
            hovertemplate='Words: %{text}<br>(x,y)= (%{x:.2f}, %{y:.2f})',
            name=label
        ))

# Set layout properties
    fig.update_layout(
        showlegend=True,
        title='Scatter Plot with Clusters',
        width=800,
        height=450,
        hoverlabel=dict(
            bgcolor="white",
            font_size=12,
            font_family="Arial"
        )
    )

    fig.show()

In [None]:
plot(most_frequent_words, embeddings_en_2d, word_clusters)

In [None]:
model.wv.most_similar(positive=['homer','bart'], negative=['marge'])

[('lisa', 0.7585029006004333),
 ('grampa', 0.6689010262489319),
 ('abe', 0.639664888381958),
 ('maggie', 0.5964199900627136),
 ('milhouse', 0.575554609298706),
 ('mrs', 0.5718148350715637),
 ('son', 0.5607784390449524),
 ('dad', 0.5337631702423096),
 ('mr', 0.5313624143600464),
 ('moe', 0.5113430619239807)]

In [None]:
model.wv.most_similar(positive=['bart','school'], negative=['lisa'])

[('church', 0.7809023261070251),
 ('store', 0.6747639775276184),
 ('game', 0.6718762516975403),
 ('summer', 0.655957818031311),
 ('town', 0.6495531797409058),
 ('weekend', 0.6431702375411987),
 ('house', 0.6370514035224915),
 ('plant', 0.6352666616439819),
 ('party', 0.6331039667129517),
 ('bar', 0.6286707520484924)]

In [None]:
model.wv.most_similar(positive=['marge','home'], negative=['homer'])

[('back', 0.7484268546104431),
 ('bed', 0.6436793208122253),
 ('sleep', 0.629082441329956),
 ('dinner', 0.5389255285263062),
 ('car', 0.5374376773834229),
 ('together', 0.5238505601882935),
 ('here', 0.522312581539154),
 ('away', 0.5201205611228943),
 ('fruition', 0.5124994516372681),
 ('school', 0.5111822485923767)]

# Классификатор

In [None]:
char = pd.read_csv('simpsons_characters.csv')
char[(char['name']=='Lisa Simpson')|(char['name']=='Bart Simpson')]

Unnamed: 0,id,name,normalized_name,gender
71,8,Bart Simpson,bart simpson,m
72,9,Lisa Simpson,lisa simpson,f


In [None]:
data = df[(df['character_id']=='8')|(df['character_id']=='9')]

In [None]:
data.loc[data['character_id']=='8','character_id']=1
data.loc[data['character_id']=='9','character_id']=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data['character_id']=='8','character_id']=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data['character_id']=='9','character_id']=0


In [None]:
data = data[~data['normalized_text'].isna()]

In [None]:
data.head(1)

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,0,3,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3


In [None]:
sentences =  [x.split(' ') for x in df['normalized_text'].fillna('').tolist() ]

In [None]:
model = models.Word2Vec(vector_size=200)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(4829369, 6661320)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['normalized_text'], data['character_id'], test_size=0.2, random_state=42)

In [None]:
def vectorize(sentence):
    try:
        words = sentence.split()
    except:
        print(sentence)
    words_vecs = [model.wv[word] for word in words if word in model.wv]
    if len(words_vecs) == 0:
        return np.zeros(200)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [None]:
X_train = np.array([vectorize(sentence) for sentence in X_train])
X_test = np.array([vectorize(sentence) for sentence in X_test])

In [None]:
X_train = torch.from_numpy(X_train).float()

In [None]:
X_test= torch.from_numpy(X_test).float()

In [None]:
y_test = torch.from_numpy(np.array(y_test.astype(int).tolist()))

In [None]:
y_train = torch.from_numpy(np.array(y_train.astype(int).tolist()))

In [None]:
import os
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [None]:
class Deep(nn.Module):
    def __init__(self,in_=4000,rep=3,blocks = 3):
        super().__init__()
        self.rep = rep
        self.blocks = blocks
        self.drop1 = nn.Dropout(0.5)
        self.layer1 = nn.Linear(200, in_*2)
        self.act1 = nn.ReLU()
        self.layers_1 = []
        self.act_1 = []

        for block in range(blocks):
            for i,layer in enumerate(range(rep)):
                if i == 0:
                    self.layers_1.append(nn.Linear(in_*2, in_))
                else:
                    self.layers_1.append(nn.Linear(in_, in_))

                self.act_1.append(nn.ReLU())
            in_ = int(in_/2)
        self.layers_1 =  nn.ModuleList( self.layers_1)
        self.act_1 =  nn.ModuleList( self.act_1)

        self.drop2 = nn.Dropout(0.5)
        self.layer2 = nn.Linear(in_*2, in_)
        self.norm = nn.BatchNorm1d(in_)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(in_, 200)
        self.act3 = nn.ReLU()
        self.drop3 = nn.Dropout(0.5)

        self.output = nn.Linear(200, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.act1(self.layer1(self.drop1(x)))
        j = 0
        for _ in range(self.blocks):
            for i,layer in enumerate(range(self.rep)):
                if i == 0:
                    x = self.act_1[j](self.layers_1[j](F.dropout(x,training=self.training)))
                else:
                     x = self.act_1[j](self.layers_1[j](x))
                j+=1
        x = self.act2(self.norm(self.layer2(x)))
        x = self.act3(self.layer3(x))

        x = self.sigmoid(self.output(self.drop3(x)))
        return x

In [None]:
import time
from  torch.optim.lr_scheduler import StepLR
from torch.autograd import Variable
model = Deep()
model.to('cuda')
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()
X_train, y_train = X_train.to('cuda'), y_train.to('cuda')
X_test, y_test = X_test.to('cuda'), y_test.to('cuda')

In [None]:
len(X_train)

19273

In [None]:
from torch.utils.data import DataLoader, TensorDataset
dataloader = DataLoader(TensorDataset(X_train, y_train), batch_size=5000, shuffle=True)

In [None]:

for ep in range(5000):
    start = time.time()
    train_loss = 0.
    train_passed = 0
    for X,y in dataloader:
        X,y = X.to('cuda'),y.to('cuda')
        model.train()
        optimizer.zero_grad()
        y_pred = model(X)
        loss = criterion(y_pred, y.reshape(-1,1).float())
        loss.backward()
        optimizer.step()


    if ep %10 == 0:
        print(loss)
        model.eval()
        y_pred = model(X_train)
        y_pred = torch.round(y_pred)
        print('Train AUC ',(y_pred.to('cpu').flatten()  == y_train.to('cpu') ).sum().item()/len(y_train))
        y_pred = model(X_test)
        y_pred = torch.round(y_pred)
        print('Test AUC ',(y_pred.to('cpu').flatten()  == y_test.to('cpu') ).sum().item()/len(y_test))


tensor(0.6946, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Train AUC  0.45255019976132416
Test AUC  0.4505084042332434
tensor(0.6737, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Train AUC  0.5989726560473201
Test AUC  0.5934841253372068
tensor(0.6623, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Train AUC  0.6092979816323354
Test AUC  0.5862212077194439
tensor(0.6589, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Train AUC  0.6107507912623878
Test AUC  0.5758456111226395
tensor(0.6515, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Train AUC  0.6321797333056608
Test AUC  0.5938991492010791
tensor(0.6409, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Train AUC  0.6224770404192393
Test AUC  0.5681676696410044
tensor(0.6423, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Train AUC  0.6627925076531936
Test AUC  0.6038597219340112
tensor(0.6344, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
Train AUC  0.66045

KeyboardInterrupt: 