<a href="https://colab.research.google.com/github/emilianog4/Data-Science-Projects/blob/main/Scikit_learn_pruebas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Utilizar Kaggle en google colab

In [None]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! mkdir ~/.kaggle

In [3]:
!cp /content/drive/MyDrive/colab/kaggle.json ~/.kaggle/kaggle.json

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

## Extrayendo el dataset de IMDB movie reviews de kaggle.com

In [5]:
! kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s] 66% 17.0M/25.7M [00:00<00:00, 175MB/s]
100% 25.7M/25.7M [00:00<00:00, 188MB/s]


# # abriendo el archivo .zip que contiene el archivo .csv 

In [6]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        




---



---


# **Importando librerias**

In [7]:
import pandas as pd

# **leo el dataset IMDB**

In [8]:
df_review = pd.read_csv('IMDB Dataset.csv')

In [9]:
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [10]:
df_positive = df_review[df_review['sentiment']=='positive'][:9000]
df_negative = df_review[df_review['sentiment']=='negative'][:1000]

In [11]:
df_review_des = pd.concat([df_positive, df_negative])
df_review_des.value_counts('sentiment')

sentiment
positive    9000
negative    1000
dtype: int64

# Dataset Desbalanceado

In [12]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
df_review_bal, df_review_bal['sentiment'] = rus.fit_resample(df_review_des[['review']], df_review_des['sentiment'])
df_review_bal.value_counts(['sentiment'])

sentiment
negative     1000
positive     1000
dtype: int64

## Separando data para entrenar (train) y testear (test)

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_review_bal, test_size=0.33, random_state=42)

In [14]:
train_x, train_y = train['review'], train['sentiment']
test_x, test_y = test['review'], test['sentiment']

# Representacion de Text (Bag of words)

# Transformar data de texto a data numerica

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)
test_x_vector = tfidf.transform(test_x)

In [16]:
train_x_vector

<1340x20578 sparse matrix of type '<class 'numpy.float64'>'
	with 116957 stored elements in Compressed Sparse Row format>

# Support Vector Machines (SVM)

In [17]:
from sklearn.svm import SVC

svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_y)

## Testeo

In [18]:
print(svc.predict(tfidf.transform(['A good movie'])))
print(svc.predict(tfidf.transform(['An excellent movie'])))
print(svc.predict(tfidf.transform(['I did not like this movie at all I gave this movie away'])))

['positive']
['positive']
['negative']


# Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier()
dec_tree.fit(train_x_vector, train_y)

# Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_x_vector.toarray(), train_y)

# Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_x_vector, train_y)

# Evaluacion del modelo

## Precision del modelo

In [26]:
print('SVC: ', svc.score(test_x_vector, test_y))
print('Decision Tree: ', dec_tree.score(test_x_vector, test_y))
print('Gaussian Naive Bayes: ', gnb.score(test_x_vector.toarray(), test_y))
print('Logistic Regression: ', lr.score(test_x_vector, test_y))

SVC:  0.8348484848484848
Decision Tree:  0.6787878787878788
Gaussian Naive Bayes:  0.5878787878787879
Logistic Regression:  0.8333333333333334


# F1 Score

In [27]:
# F1 Score = 2 * (Recall * Precision) / (Recall + Precision)
from sklearn.metrics import f1_score

f1_score(test_y, svc.predict(test_x_vector), 
         labels=['positive', 'negative'],
         average=None)

array([0.83803863, 0.83153014])

# Reporte de clasificacion

In [29]:
from sklearn.metrics import classification_report

print(classification_report(test_y, svc.predict(test_x_vector),
                      labels=['positive', 'negative']))

              precision    recall  f1-score   support

    positive       0.83      0.84      0.84       335
    negative       0.84      0.83      0.83       325

    accuracy                           0.83       660
   macro avg       0.83      0.83      0.83       660
weighted avg       0.83      0.83      0.83       660



# Confusion Matrix

In [30]:
from sklearn.metrics import confusion_matrix

confusion_matrix(test_y, svc.predict(test_x_vector),
                 labels=['positive', 'negative'])

array([[282,  53],
       [ 56, 269]])

# Optimizacion del modelo