In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so that files under /content/drive can be read.
drive.mount('/content/drive')

# Read the IMDB review CSV into `df`; the trailing expression displays the
# names of the columns that were loaded.
df = pd.read_csv('/content/drive/MyDrive/sem3ai/Copy of IMDB Dataset.csv')
df.columns


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Index(['review', 'sentiment'], dtype='object')

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stop-word corpus before it is read below.
nltk.download('stopwords')

# Module-level helpers consumed by preprocess_text: a Porter stemmer and the
# English stop words as a set (set membership is what the filter relies on).
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text, stemmer=None, stopword_set=None):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Pipeline: remove HTML line-break tags, replace every non-letter with a
    space, lowercase, split on whitespace, drop stop words, stem what is left.

    Parameters
    ----------
    text : str
        Raw review text.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply to each kept token. Defaults to the module-level ``ps``.
    stopword_set : container of str, optional
        Lower-case words to discard. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The kept, stemmed tokens joined by single spaces (empty string when
        nothing survives the filtering).
    """
    # Resolve the defaults at call time so the notebook-level helpers are only
    # required when the caller does not supply replacements.
    stemmer = ps if stemmer is None else stemmer
    stopword_set = stop_words if stopword_set is None else stopword_set

    # Remove literal line-break tags (<br>, <br/>, <br />) first. The letter
    # filter below would otherwise strip only the punctuation and leave a
    # stray "br" token behind for every tag.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    text = re.sub('[^a-zA-Z]', ' ', text)

    tokens = text.lower().split()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stopword_set)

# Run the clean-up only while the labels are still text. Without this guard,
# re-running the cell stems the already-stemmed reviews a second time and sends
# the already-numeric labels through the mapping again, which turns every
# label into NaN (0 and 1 are not keys of the dict).
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['review'] = df['review'].apply(preprocess_text)
    # Encode the target as integers: positive -> 1, negative -> 0.
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Turn the review strings into count features with a CountVectorizer.
vectorizer = CountVectorizer()
# The vectorizer is fitted on the training reviews only.
# NOTE(review): X_train / X_test are rebound from text to vectorised features
# here, so this cell presumably cannot be re-run on its own without first
# re-running the train/test split — confirm.
X_train = vectorizer.fit_transform(X_train)
# The test reviews go through the already-fitted vectorizer (no second fit).
X_test = vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the vectorised training reviews.
model = MultinomialNB().fit(X_train, y_train)

# Scores and hard labels for the held-out reviews; y_prob keeps only column 1
# of the predict_proba output.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
# Fraction of test reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

# Metrics for class label 1. Each value is read by its key instead of
# unpacking .values() positionally: the names can no longer drift if the
# report's key order changes, and `_` (which IPython uses for the previous
# cell output) is no longer rebound to the support count.
report_pos = classification_report(y_test, y_pred, output_dict=True)['1']
precision = report_pos['precision']
recall = report_pos['recall']
f1 = report_pos['f1-score']

conf_matrix = confusion_matrix(y_test, y_pred)
# ROC-AUC is computed from the scores in y_prob, not from the hard predictions.
roc_auc = roc_auc_score(y_test, y_prob)

print("Accuracy : ", accuracy)
print("Precision : ", precision, "\nRecall : ", recall, "\nF1-score : ", f1)
print("Confusion Matrix : \n", conf_matrix)
print("ROC-AUC Score : ", roc_auc)

Accuracy :  0.8542
Precision :  0.8690991548134406 
Recall :  0.8366739432427069 
F1-score :  0.8525783619817998
Confusion Matrix : 
 [[4326  635]
 [ 823 4216]]
ROC-AUC Score :  0.9230088558587904


In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

# Load the scikit-learn breast-cancer dataset and pull out features and labels.
data = load_breast_cancer()
X, y = data.data, data.target

In [None]:
# Build a labelled DataFrame: one named column per feature, plus the target
# stored under 'diagnosis'. This rebinds `df`, replacing the earlier frame.
df = pd.DataFrame(data=X, columns=data.feature_names)
df['diagnosis'] = y

# Print the summary statistics first, then the count of missing values per column.
for summary in (df.describe(), df.isnull().sum()):
    print(summary)

       mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000000             0.000000   
25%      

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split BEFORE scaling. Fitting the scaler on the full matrix lets the held-out
# rows contribute to the mean/std that are then used to scale the training
# rows, i.e. information from the test set leaks into training.
# The same test_size and seed are used, so the rows assigned to each split are
# unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training split only, then apply those
# same statistics to the held-out split.
# `X` itself is left unscaled, which also keeps this cell safe to re-run.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Logistic-regression estimator handed to RFE (iteration cap set to 200).
model = LogisticRegression(max_iter=200)
# Recursive feature elimination configured to keep 5 features.
rfe = RFE(estimator=model, n_features_to_select=5)
# Fit the selector on the training split only. As the last expression of the
# cell, the value returned by fit() is also what the notebook displays.
# NOTE(review): `model` is fitted explicitly in a later cell, which suggests
# RFE does not leave `model` itself fitted — confirm.
rfe.fit(X_train, y_train)

In [None]:
# Positions at which the fitted selector's support mask is set. np.where
# returns a one-element tuple for the 1-D mask, which is unpacked directly.
(selected_features,) = np.where(rfe.support_)
print(f"Selected Feature Indices: {selected_features}")

# Reduce both splits with the fitted selector, then fit the classifier on the
# reduced training data (last expression, so the fitted model is displayed).
X_test_rfe = rfe.transform(X_test)
X_train_rfe = rfe.transform(X_train)
model.fit(X_train_rfe, y_train)

Selected Feature Indices: [10 20 21 23 27]


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Hard labels and scores for the held-out rows, using the reduced feature set;
# y_prob keeps only column 1 of the predict_proba output.
y_pred = model.predict(X_test_rfe)
y_prob = model.predict_proba(X_test_rfe)[:, 1]

accuracy = accuracy_score(y_test, y_pred)

# Metrics for class label 1. Each value is read by its key instead of
# unpacking .values() positionally: the names can no longer drift if the
# report's key order changes, and `_` (which IPython uses for the previous
# cell output) is no longer rebound to the support count.
report_pos = classification_report(y_test, y_pred, output_dict=True)['1']
precision = report_pos['precision']
recall = report_pos['recall']
f1 = report_pos['f1-score']

conf_matrix = confusion_matrix(y_test, y_pred)
# ROC-AUC is computed from the scores in y_prob, not from the hard predictions.
roc_auc = roc_auc_score(y_test, y_prob)

In [None]:
# Report the evaluation metrics held in accuracy, precision, recall, f1,
# conf_matrix and roc_auc.
# Precision, recall and F1 share one print call; the embedded "\n" puts each on its own line.
print("Accuracy : ", accuracy)
print("Precision : ", precision, "\nRecall : ", recall, "\nF1-score : ", f1)
print("Confusion Matrix : \n", conf_matrix)
print("ROC-AUC Score : ", roc_auc)

Accuracy :  0.9736842105263158
Precision :  0.9722222222222222 
Recall :  0.9859154929577465 
F1-score :  0.9790209790209791
Confusion Matrix : 
 [[41  2]
 [ 1 70]]
ROC-AUC Score :  0.9977071732721914


In [None]:
# Compare held-out accuracy when RFE keeps different numbers of features.
feature_counts = [3, 7]
for n_features in feature_counts:
    # Loop-local names (rfe_n, y_pred_n, accuracy_n) are used on purpose: the
    # original loop overwrote `model`, `rfe`, `X_train_rfe`, `X_test_rfe`,
    # `y_pred` and `accuracy`, so re-running an earlier reporting cell
    # afterwards mixed this loop's accuracy with the 5-feature precision/recall.
    rfe_n = RFE(estimator=model, n_features_to_select=n_features)
    rfe_n.fit(X_train, y_train)

    # RFE.predict reduces X_test to the kept features and predicts with the
    # estimator that RFE fitted on those features. That replaces the manual
    # transform + model.fit + model.predict sequence, so `model` is not refitted.
    y_pred_n = rfe_n.predict(X_test)
    accuracy_n = accuracy_score(y_test, y_pred_n)
    print("Accuracy with top", n_features, "features:", accuracy_n)

Accuracy with top 3 features: 0.9649122807017544
Accuracy with top 7 features: 0.9736842105263158
