# Kategoryzowanie tekstu za pomocą regresji logistycznej i lasów losowych

## Przygotowanie bibliotek

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import sklearn.metrics

## Przygotowanie danych

### Wczytywanie i oczyszczanie danych

In [0]:
dataset_path = 'https://drive.google.com/uc?export=download&id=1J6x_0I61-_J48cpKGaLWXaIEeJm-0gXs'

def loadDataset(csv_path = dataset_path):
  return pd.read_csv(csv_path, encoding='latin-1')

In [0]:
dataset = loadDataset()
dataset.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],axis = 1, inplace=True)

In [0]:
dataset

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Zmiana formy danych na *bag of words* (worek słów)

In [0]:
def tokenize(sentences):
  words = []
  for sentence in sentences:
    words.extend(sentence.split())
    words = sorted(list(set(words)))
  return words

vocabulary = tokenize(dataset["v2"])

In [0]:
dataset_csv = dataset

In [0]:
dataset = []
for i in range(len(dataset_csv)):
  sentence = dataset_csv["v2"][i];
  bag_of_words = []
  for word in vocabulary:
    bag_of_words.extend([sentence.split().count(word)])
  dataset.append([bag_of_words,dataset_csv["v1"][i]])

### Rozdzielenie danych na zbiór treningowy, testowy i walidacyjny



In [0]:
import random

def split_dataset(train_percent,test_percent):
  train = math.floor(train_percent*len(dataset))
  test = math.floor(test_percent*len(dataset))
  random.shuffle(dataset)
  return [dataset[:train],dataset[train:train+test],dataset[train+test:]]

In [0]:
dataset_train, dataset_test, dataset_validation = split_dataset(0.7,0.2)

In [0]:
dictionary = {"ham":0, "spam":1}

dataset_train_X, dataset_train_y = [],[]
dataset_test_X, dataset_test_y = [],[]
dataset_validation_X, dataset_validation_y = [],[]

for i in range(len(dataset_train)):
  dataset_train_X.append(dataset_train[i][0])
  dataset_train_y.append(dictionary[dataset_train[i][1]])

for i in range(len(dataset_test)):
  dataset_test_X.append(dataset_test[i][0])
  dataset_test_y.append(dictionary[dataset_test[i][1]])

for i in range(len(dataset_validation)):
  dataset_validation_X.append(dataset_validation[i][0])
  dataset_validation_y.append(dictionary[dataset_validation[i][1]])

## Model regresji logistycznej

In [0]:
import sklearn.linear_model as sk

model = sk.LinearRegression(normalize=True)

model.fit(np.asarray(dataset_train_X),dataset_train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [0]:
model.score(np.asarray(dataset_test_X),dataset_test_y)

0.7562260946070523

## Model lasów losowych

In [0]:
import sklearn.ensemble

model = sklearn.ensemble.RandomForestClassifier()

model.fit(np.asarray(dataset_train_X),dataset_train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
model.score(np.asarray(dataset_test_X),dataset_test_y)

0.9605026929982047

# Przygotowanie własnego modelu regresji logistycznej (tymczasowo nie działające)

In [0]:
class LogisticalRegressor:
  
  def __init__(self):
    self.W = []

  def sigmoid(self, x):
    # Liczy wartość funkcji sigmoidalnej z x
    return 1/(1+math.exp(-np.sum(x)))

  def reverse_sigmoid(self, x):
    # Oblicza wartość argumentu funkcji sigmoidalnej dla znanej wartości (x)
    return -math.log((1/x)-1)

  def predict(self, x):
    print(x.shape)
    return [self.sigmoid(np.dot(np.asarray(self.W),np.asarray(np.transpose(x)[i]))) for i in range(len(np.transpose(x))-1)]

  def evaluate(self,X,y,error_function = sklearn.metrics.log_loss):
    # Oblicza poprawność modelu na daym zbiorze danych
    # Nie działa jeśli funkcja fit nie została wywołała wcześniej
    # Używana funkcja błędu to domyślnie błąd logarytmiczny, a.k.a. entropia
    return error_function(y,self.predict(X))
  
  def fit(self, X,y, steps=1000, step_size=0.003 , batch_size=10, error_function = sklearn.metrics.log_loss):
    self.W = np.random.normal(size=len(X[0])+1)
    print(self.W.shape)
    print(len(X))
    if(type(X)==list):
      X = pd.DataFrame(X) 
      y = pd.DataFrame(y)
    for step in range(steps):
      idx = np.random.permutation(X.index)
      X.reindex(idx)
      y.reindex(idx)
      X_batch = X[:batch_size]
      y_batch = y[:batch_size]

      y_pre = self.predict(X_batch)
      y_pred = reverse_sigmoid(y_pre)                       # = W*X (dla prostszego liczenia gradientu)
      y_true = reverse_sigmoid(y_batch)                     # = W_true * X
      loss = error_function(y_batch, y_pred)
      if(steps%100==0):
        print("Step: ", steps, ", loss = ",loss)
      cost = np.sum(loss**2)/2*batch_size
      gradient = np.dot(X_batch.transpose,loss)/batch_size
      self.W = self.W - step_size*gradient

## Ucznie i testowanie modelu

### Uczenie modelu

In [0]:
model = LogisticalRegressor()

In [0]:
model.fit(dataset_train_X,dataset_train_y)

(15586,)
3900


KeyboardInterrupt: ignored

In [0]:
model.W[:-1] @ np.asarray(dataset_train_X[0])

0.19819472202621746

### Testowanie na całym zbiorze danych oraz na zbiorze walidacyjnym

In [0]:
len(dataset_train_X[0])

15585