Import pandas and NumPy, the libraries used for loading and handling the data

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
!pip install nltk



`read_csv()` is the pandas function that loads a CSV file from a given path

## Load the dataset

In [None]:
# Location of the training CSV; adjust when running outside this environment.
TRAIN_CSV_PATH = "/content/Train_Data.csv"
dataset = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first ten rows of the raw data.
dataset.iloc[:10]

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0
5,james corden and the red hot chili peppers str...,0
6,u.s. dignity reserves nearly depleted,1
7,"how to re-ignite the spark in your body, mind ...",0
8,report: there still time to convert to christi...,1
9,education reform and evidence,0


# checking the info about the dataset

In [None]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44262 entries, 0 to 44261
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      44262 non-null  object
 1   is_sarcastic  44262 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 691.7+ KB


## Checking for null values

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

headline        0
is_sarcastic    0
dtype: int64

The output above shows that the dataset has no null values

---



In [None]:
# Feature variable: the headline text

In [None]:
# The feature column: raw headline text.
dataset["headline"]

0        supreme court votes 7-2 to legalize all worldl...
1        hungover man horrified to learn he made dozens...
2        emily's list founder: women are the 'problem s...
3            send your kids back to school with confidence
4                watch: experts talk pesticides and health
                               ...                        
44257     greece seeks to reassure europe as tensions rise
44258    vatican says transgender man cannot become a g...
44259    protesters ejected from donald trump rally aft...
44260          italian recipes that are oldies but goodies
44261    area loser blissfully unaffected by whims of s...
Name: headline, Length: 44262, dtype: object

# target variable

In [None]:
# The target column: the is_sarcastic label for each headline.
dataset["is_sarcastic"]

0        1
1        1
2        0
3        0
4        0
        ..
44257    0
44258    0
44259    0
44260    0
44261    1
Name: is_sarcastic, Length: 44262, dtype: int64

# Preprocessing
We convert the headlines to lowercase,<br>
remove special characters and digits from each headline,<br>
tokenize the text, remove English stop words, and join the remaining tokens back into a string.<br>

- Import `re` (regular expressions) to find and remove unwanted characters in each headline
- Import `word_tokenize` and `stopwords` from the NLTK library


In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
# word_tokenize() depends on the Punkt tokenizer data, so fetch it alongside the
# stop-word list; without it a freshly started kernel fails with LookupError.
# Newer NLTK releases look for 'punkt_tab', older ones for 'punkt'.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

# Built once when this cell runs instead of on every call: the English stop-word
# set and the compiled pattern are the same for every headline.
_STOP_WORDS = frozenset(stopwords.words('english'))
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalize one headline for vectorization.

    Steps: lowercase the text, drop every character that is not an ASCII
    letter or whitespace (digits and punctuation are removed), tokenize with
    NLTK's ``word_tokenize``, discard English stop words, and join the
    remaining tokens with single spaces.

    Args:
        text: The headline as a ``str``.

    Returns:
        The cleaned headline as a single space-separated string.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = _NON_LETTER_PATTERN.sub('', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [token for token in tokens if token not in _STOP_WORDS]

    # Join tokens back into a sentence
    return ' '.join(tokens)

# Applying the preprocess_text function to the 'headline' column

In [None]:
# Clean every headline and store the result in a new column next to the original.
dataset['preprocessed_headline'] = dataset['headline'].map(preprocess_text)

In [None]:
# Compare the first five raw headlines with their preprocessed versions.
dataset.iloc[:5]

Unnamed: 0,headline,is_sarcastic,preprocessed_headline
0,supreme court votes 7-2 to legalize all worldl...,1,supreme court votes legalize worldly vices
1,hungover man horrified to learn he made dozens...,1,hungover man horrified learn made dozens plans...
2,emily's list founder: women are the 'problem s...,0,emilys list founder women problem solvers cong...
3,send your kids back to school with confidence,0,send kids back school confidence
4,watch: experts talk pesticides and health,0,watch experts talk pesticides health


Split the dataset into training and test sets (80% / 20%)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model input: the preprocessed headline text.
X = dataset["preprocessed_headline"]

In [None]:
# Preview the first five preprocessed headlines.
X.iloc[:5]

0           supreme court votes legalize worldly vices
1    hungover man horrified learn made dozens plans...
2    emilys list founder women problem solvers cong...
3                     send kids back school confidence
4                 watch experts talk pesticides health
Name: preprocessed_headline, dtype: object

In [None]:
# Model target: the is_sarcastic label.
y = dataset["is_sarcastic"]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Display the training headlines (pandas truncates the output and reports the length).
X_train

977      rep david cicilline lgbt people entitled full ...
18954                             inflating russian threat
11070                       former senator run pot company
34583    white liberals celebrating tomi lahrens daily ...
28075           marco rubio doesnt clue oscars white means
                               ...                        
6265     consumer financial protection bureau governmen...
11284                   never allmale panels ubs exec says
38158    dont sleep targets chic new modern home collec...
860              incredible photos show world need protect
15795    seeing fifthgrader get bullied group boys vowe...
Name: preprocessed_headline, Length: 35409, dtype: object

In [None]:
# Display the test headlines (pandas truncates the output and reports the length).
X_test

12782                             north dakota heard hours
42915    report going take way inconceivable act violen...
33043    states rights rancher ryan bundy run nevada go...
1121     watching thousands march honor unlocks deeper ...
38782                debate two unthinkable united country
                               ...                        
33747    report percent americans afraid percent americans
21926    progressive groups want doug jones throw cauti...
43084    parents wish weakwilled daughter would push ba...
12451      gay couple shares beautiful story family formed
43310    video stores favorites shelf offers telling gl...
Name: preprocessed_headline, Length: 8853, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer created with the library's default settings (no arguments passed).
vectorizer = TfidfVectorizer()

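- Vectorize `X_train` and `X_test` with TF-IDF
- `fit_transform` on `X_train` (the vectorizer is fitted on the training set only)
- `transform` on `X_test` (reuses the vectorizer fitted on the training set)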

In [None]:
# Fit the vectorizer on the training headlines only, then reuse the fitted
# vectorizer on the test headlines so the test set plays no part in fitting.
X_train_vectorizer = vectorizer.fit_transform(X_train)
X_test_vectorizer = vectorizer.transform(X_test)

Defining our model using svm

In [None]:
from sklearn import svm

In [None]:
# Support-vector classifier with the library's default hyperparameters (only
# random_state is set), trained on the vectorized training headlines.
svm_model = svm.SVC(random_state=42)
svm_model.fit(X_train_vectorizer, y_train)

Model prediction

In [None]:
# Predict a label for every vectorized test headline.
predictions = svm_model.predict(X_test_vectorizer)

In [None]:
# Wrap the predicted labels in a DataFrame for tabular display.
predict = pd.DataFrame(data=predictions)

In [None]:
# Display the predicted labels (pandas truncates the output for long frames).
predict

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,0
...,...
8848,1
8849,0
8850,1
8851,0


# Evaluate the model

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of test headlines whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9234


Classification report

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall, F1-score and support on the held-out test set.
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=predictions))

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      4711
           1       0.94      0.89      0.92      4142

    accuracy                           0.92      8853
   macro avg       0.92      0.92      0.92      8853
weighted avg       0.92      0.92      0.92      8853

