# Train and test split

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

`svm` is scikit-learn's support vector machine module; it provides the ML model used for training.

In [None]:
dt = pd.read_csv('/content/diabetes.csv')

In [None]:
dt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
dt.shape

(768, 9)

In [None]:
dt['Outcome'].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [None]:
dt.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
# split the frame into the label column (y) and everything else (x)
y = dt['Outcome']
x = dt.drop(columns='Outcome')

In [None]:
# standardise the feature columns: learn the scaling parameters from x and apply them in one call
scaler = StandardScaler()
stand_dt = scaler.fit_transform(x)
print(stand_dt)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [None]:
stand_dt.std()

1.0

In [None]:
# Split the raw features/labels 80/20 into train and test sets (seed fixed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

# Fit a scaler on the training rows only and apply it to both splits, so that
# test-set statistics never leak into preprocessing. The scaler fitted on the
# whole of x earlier in this notebook is not used by this split.
split_scaler = StandardScaler()
x_train_std = split_scaler.fit_transform(x_train)
x_test_std = split_scaler.transform(x_test)

x_train.shape

(614, 8)

# Handling imbalanced dataset

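An imbalanced dataset has a very large difference between the number of samples in each class (e.g. diabetic vs. non-diabetic cases, or legit vs. fraud transactions); training on it directly tends to give a model that performs badly on the rare class.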

In [None]:
cred = pd.read_csv('/content/credit_data.csv')

In [None]:
cred.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


`Class` = 0/1 marks a transaction as legit (0) or fraud (1), so this is a basic binary classification problem.

In [None]:
#distribution of 2 classes - 0 and 1 and determine if balanced or not
cred['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,213570
1.0,398


so this is a highly imbalanced dataset where 0=legit
1=fraud transactions

In [None]:
# split the transactions by label: Class 0 -> legit, Class 1 -> fraud
legit = cred.loc[cred['Class'] == 0]
fraud = cred.loc[cred['Class'] == 1]

In [None]:
legit.shape

(213570, 31)

In [None]:
fraud.shape

(398, 31)

UNDERSAMPLING

sampling method to handle imbalanced data

We will randomly keep only 398 datapoints from the legit class, matching the number of fraud cases.

In [None]:
# Build a legit subset the same size as the fraud class so both classes are
# equally represented. The size is taken from `fraud` instead of hard-coding 398,
# and random_state pins the draw so a fresh Restart & Run All picks the same rows.
legit_sample = legit.sample(n=len(fraud), random_state=2)

this will give a pandas dataframe

In [None]:
legit_sample.shape

(398, 31)

In [None]:
# stack the sampled legit rows on top of the fraud rows
# (pd.concat joins along axis 0, i.e. row-wise, by default)
new_dt = pd.concat([legit_sample, fraud])

In [None]:
new_dt.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
106027,69788.0,-3.368565,1.606637,0.942707,-0.008758,-2.068291,0.124858,-1.595843,2.202336,-0.183463,...,0.316589,0.898757,0.086629,0.257176,0.327018,0.559134,0.321843,-0.199958,26.3,0.0
74168,55453.0,-0.293687,-2.768153,0.330228,0.777695,-1.495443,1.153841,-0.000932,0.332734,0.866235,...,0.297829,-0.487819,-0.48923,-0.195375,-0.316461,0.878908,-0.139438,0.120327,716.61,0.0
87994,61934.0,1.125762,0.233302,0.746772,1.131157,-0.482845,-0.705724,0.071169,-0.193641,-0.019158,...,-0.149508,-0.472764,0.056499,0.38215,0.328838,-0.632459,0.037346,0.045418,49.99,0.0
163771,116198.0,-0.83715,0.073428,1.280361,-2.745319,-0.18944,-0.071674,-0.011769,-0.089642,1.962355,...,0.197235,0.655321,0.025938,0.695158,-0.871297,-0.550137,-0.612679,-0.000446,2.55,0.0
87407,61674.0,1.241774,-1.20914,1.595206,0.004126,-2.123171,0.211764,-1.518982,0.283992,0.923581,...,-0.552148,-0.716168,0.075398,0.436751,0.10493,1.075899,0.016891,0.021507,22.5,0.0


In [None]:
new_dt.tail()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
208651,137211.0,0.630579,1.183631,-5.066283,2.179903,-0.703376,-0.103614,-3.49035,1.094734,-0.717418,...,0.621622,0.043807,0.102711,-0.601505,0.127371,-0.163009,0.853792,0.356503,39.45,1.0
212516,138894.0,-1.298443,1.9481,-4.509947,1.305805,-0.019486,-0.509238,-2.643398,1.283545,-2.515356,...,1.178032,1.360989,-0.272013,-0.325948,0.290703,0.841295,0.643094,0.201156,0.01,1.0
212644,138942.0,-2.356348,1.74636,-6.374624,1.772205,-3.439294,1.457811,-0.362577,1.443791,-1.927359,...,0.857942,0.621203,0.964817,-0.619437,-1.732613,0.108361,1.130828,0.415703,727.91,1.0
213092,139107.0,-4.6665,-3.95232,0.206094,5.153525,5.229469,0.93904,-0.635033,-0.704506,-0.234786,...,-0.664263,1.821422,0.113563,-0.759673,-0.502304,0.630639,-0.51388,0.729526,22.47,1.0
213116,139117.0,-3.975939,-1.244939,-3.707414,4.544772,4.050676,-3.407679,-5.063118,1.007042,-3.190158,...,1.059737,-0.037395,0.348707,-0.162929,0.410531,-0.123612,0.877424,0.667568,8.3,1.0


In [None]:
new_dt['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,398
1.0,398


# Feature Extraction

The mapping from textual data to real-valued vectors is called feature extraction.

Bag of words(bow) = list of unique words in text corpus

Term frequency–inverse document frequency (TF-IDF) = a weight for each word in a document that grows with how often the word appears in that document and shrinks with how many documents contain the word




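Term frequency: $\mathrm{tf}(t, d) = \dfrac{\text{number of times term } t \text{ appears in document } d}{\text{total number of terms in } d}$
[tells us how important a word is within a document]

Inverse document frequency: $\mathrm{idf}(t) = \log\left(\dfrac{N}{n}\right)$, where $N$ = total number of documents and
$n$ = number of documents in which the term $t$ appears
[the IDF of a rare word is high and that of a frequent word is low]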

  
Words like IS, ARE, WAS are used very often, so we don't want to give them much weight; multiplying TF by IDF down-weights them and highlights the most important words.

TF IDF VALUE = TF*IDF




In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf-idf

In [None]:
# turn the text into TF-IDF feature vectors: learn the vocabulary/weights and encode x in one call
# NOTE(review): x is expected to be an iterable of text documents; the only x defined
# above this cell is the numeric diabetes feature frame -- confirm what x should be here
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)
print(x)

this prints the transformed data

# NUmerical dataset processing

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# (re)load the diabetes dataset for the numerical-processing walkthrough
dt = pd.read_csv('/content/diabetes.csv')


In [None]:
dt.shape

(768, 9)

In [None]:
dt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
dt.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


dividing into features and targets and then into train and test

In [None]:
# target = the Outcome column; features = every other column
y = dt['Outcome']
x = dt.drop(columns='Outcome')
# 80/20 split of the raw (not yet standardised) data with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2
)

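Data standardization can be done before or after the split. In sir's view it should be done before, as otherwise the data will lose some of its part. Note, however, that fitting the scaler on the full dataset lets test-set statistics leak into training; the usual practice is to fit the scaler on the training split only and then use it to transform both splits.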

data standardizing

In [None]:
# create the StandardScaler object that performs the standardisation
scaler = StandardScaler()
# learn the scaling parameters from x, then rescale x with them
stdt = scaler.fit(x).transform(x)
print(stdt)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [None]:
 # from here on x refers to the standardised feature array instead of the raw frame
 x = stdt
 print(x)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [None]:
print(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

# Textual data processing

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE(review): this section works on text, and dt is reassigned from train.csv further
# down before it is read again, so this diabetes load looks like a leftover -- confirm
dt = pd.read_csv('/content/diabetes.csv')

install a kaggle directory so that we can access it faster


`re` is Python's regular-expression library, useful for scanning text

In [None]:
import re

A corpus is a collection of text.
nltk is the Natural Language Toolkit; it contains several tools for text processing.

In [None]:
import nltk
# nltk is the top-level Natural Language Toolkit package
from nltk.corpus import stopwords
# nltk.corpus is a subpackage of nltk; the stopwords object is imported from it
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# printing the English stopwords
print(stopwords.words('english'))
# stopwords are very common words (e.g. 'i', 'me', 'my')

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# IMPORTING DATASET FROM KAGGLE

In [None]:
! pip install -q kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:

! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

ref                                                                  title                                           size  lastUpdated          downloadCount  voteCount  usabilityRating  
-------------------------------------------------------------------  ---------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
haseebindata/student-performance-predictions                         Student Performance Predictions                  9KB  2024-08-17 06:57:57           6094        130  0.9411765        
berkayalan/paris-2024-olympics-medals                                Paris 2024 Olympics Medals                       1KB  2024-08-14 11:02:45           5455         99  1.0              
cvergnolle/gold-price-and-relevant-metrics                           Gold Price & Relevant Metrics                   36KB  2024-08-13 22:08:02           1512         22  1.0              
uom190346a/sleep-and-health-metrics                         

In [None]:
! kaggle competitions download -c fake-news

Downloading fake-news.zip to /content
 86% 40.0M/46.5M [00:00<00:00, 93.9MB/s]
100% 46.5M/46.5M [00:00<00:00, 85.5MB/s]


now we need to unzip the zip files as these are in zip format

In [None]:
! unzip train.csv.zip

Archive:  train.csv.zip
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# Continued textual data processing

In [None]:
import re

# read the extracted train.csv into a DataFrame
dt = pd.read_csv('/content/train.csv')

In [None]:
dt.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
dt.shape

(20800, 5)

In [None]:
dt.isnull().sum()

Unnamed: 0,0
id,0
title,558
author,1957
text,39
label,0


We can't replace missing text values with a mean or median, so we replace them with an empty string instead.

In [None]:
#replacing missing values with an empty string
dt=dt.fillna('')

In [None]:
dt.isnull().sum()

in this we will analyse the data only with 2 features title and author name

In [None]:
# combine author and title (separated by a single space) into one 'content' text column
dt['content']=dt['author'] + ' ' + dt['title']
dt.head()

In [None]:
# separating feature (the combined 'content' text) and label
x=dt['content']
y=dt['label']
print(x,y)

*** STEMMING: Reducing a word to its root word (stem)

enjoying,enjoyable,...=enjoy

In [None]:
port_stem = PorterStemmer()

In [None]:
#making a function to do stemming
def stemming(content):
  """Normalise one piece of text into a space-joined string of stemmed words.

  Steps: replace every non-letter character with a space, lowercase, split on
  whitespace, drop English stopwords, stem the remaining words with the
  module-level ``port_stem`` and join them back with single spaces.

  Args:
    content: the text to process. Non-string values are converted with
      ``str()`` first so that ``re.sub`` never receives a non-string.

  Returns:
    str: the stemmed words joined by single spaces (an empty string when no
    word is left after filtering).
  """
  # Handle potential non-string values
  if not isinstance(content, str):
    content = str(content)  # Convert to string if necessary

  # build the stopword lookup once per call; re-reading the whole stopword list
  # for every single word is needlessly slow, and a set gives O(1) membership tests
  english_stopwords = set(stopwords.words('english'))

  # keep only a-z / A-Z; punctuation and digits become spaces
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  # lowercase everything, then split into individual words
  words = letters_only.lower().split()
  # stem every word that is not a stopword
  stemmed_words = [port_stem.stem(word) for word in words if word not in english_stopwords]
  # join the stems back into one string
  return ' '.join(stemmed_words)


In [None]:
dt['content'] = dt['content'].apply(stemming)

TypeError: expected string or bytes-like object

In [None]:
x = dt['content'].values
y = dt['label'].values


converting textual data to feature vector

This will identify which words are important and which are not

In [None]:
# learn the TF-IDF vocabulary from the text and encode it as feature vectors in one step
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)
