# Logistic Regression using IMDB review comments
 - DATA: https://www.kaggle.com/competitions/word2vec-nlp-tutorial/overview

In [None]:
import pandas as pd
import numpy as np
import torch
import sklearn
import torch.nn.functional as F

In [None]:
# Load only the first 1000 labelled reviews. nrows stops the parser early
# instead of reading the whole TSV and then slicing most of it away.
df = pd.read_csv("labeledTrainData.tsv", sep="\t", nrows=1000)

In [None]:
# Peek at the first two rows of the loaded data.
df.head(n=2)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."


In [None]:
# Rough sentence count: the number of "."-separated pieces in each review.
df["sentence"] = [len(text.split(".")) for text in df["review"]]

In [None]:
# Encode every review as a TF-IDF weighted vector.
from sklearn.feature_extraction.text import TfidfVectorizer
# The raw review texts are the documents to vectorise.
corpus = df["review"]
# Vectoriser with default settings; later cells read its feature names.
vectorizer = TfidfVectorizer()
# Fit on the corpus and transform it in one call; X is a sparse matrix with
# one row per review.
X = vectorizer.fit_transform(corpus)

In [None]:
# Vocabulary size, i.e. the number of TF-IDF feature columns.
len(vectorizer.vocabulary_)

18361

In [None]:
# Display the sparse TF-IDF matrix summary (shape, dtype, stored elements).
X

<1000x18361 sparse matrix of type '<class 'numpy.float64'>'
	with 137788 stored elements in Compressed Sparse Row format>

In [None]:
# Expand the sparse TF-IDF matrix into plain Python lists, one per review,
# and store them alongside the text.
df["content_vector"] = X.toarray().tolist()

In [None]:
# Peek at the first two rows, now including the derived columns.
df.head(n=2)

Unnamed: 0,id,sentiment,review,sentence,content_vector
0,5814_8,1,With all this stuff going down at the moment w...,21,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",17,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
class MyClassificationModel(torch.nn.Module):
  """Logistic-regression style classifier: one linear layer plus a sigmoid.

  Args:
    num_feature: Size of each input feature vector.
    num_out: Number of output units produced per sample.
  """

  def __init__(self, num_feature, num_out):
    super().__init__()
    # Affine map from the input features to the raw scores.
    self.linear = torch.nn.Linear(num_feature, num_out)
    # Element-wise sigmoid applied to the raw scores.
    self.sm = torch.nn.Sigmoid()

  def forward(self, x):
    """Return the sigmoid of the linear layer's output for ``x``."""
    return self.sm(self.linear(x))

In [None]:
# One input per TF-IDF feature, one output unit for the binary sentiment.
myLin = MyClassificationModel(
  num_feature=len(vectorizer.get_feature_names_out()),
  num_out=1,
)

In [None]:
# Plain stochastic gradient descent over all of the model's parameters.
optimizer = torch.optim.SGD(params=myLin.parameters(), lr=0.1)

In [None]:
nb_epochs = 10

# Build the feature and label tensors once, up front, rather than converting
# every sample from a Python list on each step of each epoch.
features = torch.from_numpy(
  np.asarray(df["content_vector"].tolist(), dtype=np.float32)
)
labels = torch.from_numpy(
  df["sentiment"].to_numpy(dtype=np.float32)
).unsqueeze(1)

# range(nb_epochs + 1) makes nb_epochs + 1 passes, labelled 0..nb_epochs.
for epoch in range(nb_epochs + 1):
  total_loss = 0.0
  # split(1) yields one-row views, so every step sees a (1, num_features)
  # input and a (1, 1) target: the model is updated after each sample.
  for x_train, y_train in zip(features.split(1), labels.split(1)):
    hypothesis = myLin(x_train)
    # Compute the cost (binary cross-entropy between prediction and label).
    cost = F.binary_cross_entropy(hypothesis, y_train)
    total_loss += cost.item()

    # Use the cost to improve H(x): reset gradients, backpropagate, update.
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
  # Report the mean per-sample loss for this epoch.
  print(epoch, total_loss / len(df))

0 0.6282594498991966
1 0.5892519999742508
2 0.5553338939547539
3 0.5255140135586261
4 0.49902962225675584
5 0.47529961360991
6 0.4538754263371229
7 0.43440570805966855
8 0.4166111406087875
9 0.4002664915509522
10 0.38518775084242224


## 실습: Accuracy를 계산하는 함수를 구현하여 학습된 모델의 성능을 평가하시오