# 1. LogisticRegression_BinaryClassification_NoDataLoader (IMDB Dataset)
 - DATA:https://www.kaggle.com/competitions/word2vec-nlp-tutorial/overview

##### 0) 라이브러리 불러오기

In [19]:
# (1) 기본 라이브러리
import pandas as pd
import numpy as np

# (2) 머신러닝 라이브러리
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn.functional as F

##### 1) 데이터 불러오기

In [20]:
df = pd.read_csv("./data/labeledTrainData.tsv", sep = '\t')[:1000]

In [21]:
df.head(2)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."


##### 2) 문장 개수 추출

In [22]:
df["sentence"] = df["review"].apply(lambda x: len(x.split(".")))
#df["content"].apply(lambda x: len(x.split("."))).value_counts()

##### 3) Corpus 추출

In [23]:
corpus = df["review"]

##### 4) TF-IDF 추출

In [24]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [25]:
len(vectorizer.get_feature_names_out())

18361

In [26]:
X

<1000x18361 sparse matrix of type '<class 'numpy.float64'>'
	with 137788 stored elements in Compressed Sparse Row format>

##### 5) Content Vector 추출

In [27]:
df["content_vector"] = X.todense().tolist()

In [28]:
df.head(2)

Unnamed: 0,id,sentiment,review,sentence,content_vector
0,5814_8,1,With all this stuff going down at the moment w...,21,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",17,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


##### 6) 모델 정의

In [29]:
class MultivariableClassificationModel(torch.nn.Module):
  def __init__(self, num_feature, num_out):
    super().__init__()
    self.linear = torch.nn.Linear(in_features=num_feature, out_features= num_out)
    self.sm = torch.nn.Sigmoid()

  def forward(self, x):
    z = self.linear(x)
    result = self.sm(z)

    return  result

myLin = MultivariableClassificationModel(len(vectorizer.get_feature_names_out()), 1)

##### 7) Optimizer 정의

In [30]:
optimizer = torch.optim.SGD(myLin.parameters(), lr=0.1)

##### 8) 모델 학습 + 성능 평가 (과제)

In [31]:
# (1) Epoch 순회
nb_epochs = 10
for epoch in range(nb_epochs + 1):
  total_loss = 0
  for idx in range(len(df)):
    # 1] train 데이터 불러오기
    x_train = np.array([df["content_vector"][idx]])
    y_train = np.array([[df["sentiment"][idx]]])
    x_train = torch.from_numpy(x_train).float()
    y_train = torch.from_numpy(y_train).int()
    # 2] 모델 예측
    y_pred = myLin(x_train)
    # 3] Cost 계산
    cost = F.binary_cross_entropy(y_pred.to(torch.float32), y_train.to(torch.float32))
    # print(cost.item())
    # 4] Loss 계산
    total_loss = total_loss + cost.item()

    # 5] 예측값 = 최대 확률
    # print(y_pred)
    prediction = torch.argmax(y_pred, dim=1) # !!!
    # 6] Accuracy 계산
    check_prediction = prediction == y_train
    # print(f"len(check_prediction), check_prediction.sum().item() : {len(check_prediction), check_prediction.sum().item()}")
    accuracy = check_prediction.sum().item() / len(check_prediction) # !!!

    # 7] 모델 역전파
    optimizer.zero_grad() 
    cost.backward() 
    optimizer.step()

    # 8] 결과 출력
    print('Epoch {:4d}/{} Batch {}/{} Cost:{:.6f} Accruacy:{:.6f}'.format(epoch, nb_epochs, idx+1, len(df), cost.item(), accuracy * 100)) #  hypothesis: {} , pred.squeeze().detach()

Epoch    0/10 Batch 1/1000 Cost:0.698195 Accruacy:0.000000
Epoch    0/10 Batch 2/1000 Cost:0.666897 Accruacy:0.000000
Epoch    0/10 Batch 3/1000 Cost:0.747070 Accruacy:100.000000
Epoch    0/10 Batch 4/1000 Cost:0.716825 Accruacy:100.000000
Epoch    0/10 Batch 5/1000 Cost:0.702143 Accruacy:0.000000
Epoch    0/10 Batch 6/1000 Cost:0.667069 Accruacy:0.000000
Epoch    0/10 Batch 7/1000 Cost:0.746907 Accruacy:100.000000
Epoch    0/10 Batch 8/1000 Cost:0.712318 Accruacy:100.000000
Epoch    0/10 Batch 9/1000 Cost:0.684168 Accruacy:100.000000
Epoch    0/10 Batch 10/1000 Cost:0.729121 Accruacy:0.000000
Epoch    0/10 Batch 11/1000 Cost:0.686572 Accruacy:100.000000
Epoch    0/10 Batch 12/1000 Cost:0.725310 Accruacy:0.000000
Epoch    0/10 Batch 13/1000 Cost:0.698750 Accruacy:0.000000
Epoch    0/10 Batch 14/1000 Cost:0.714637 Accruacy:100.000000
Epoch    0/10 Batch 15/1000 Cost:0.689523 Accruacy:100.000000
Epoch    0/10 Batch 16/1000 Cost:0.663683 Accruacy:100.000000
Epoch    0/10 Batch 17/1000 Cos