In [1]:
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv('Train_Data.csv')
test=pd.read_csv('Test_Data.csv')

data.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


In [3]:
data.shape

(44262, 2)

In [4]:
# removeing all records with no text in it

data=data[data['headline']!='']
data.shape

(44262, 2)

In [5]:
data['is_sarcastic'].value_counts()

0    23958
1    20304
Name: is_sarcastic, dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44262 entries, 0 to 44261
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      44262 non-null  object
 1   is_sarcastic  44262 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [7]:
# Performing basic NLP count based feature
import string
data['char_count']=data['headline'].apply(len)
data['word_count']=data['headline'].apply(lambda x: len(x.split()))
data['word_density']=data['char_count']/(data['word_count']+1)
data['punctuation_count'] = data['headline'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
data['title_word_count'] = data['headline'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))


In [8]:
X=data.drop(['is_sarcastic','headline'],axis=1)
y=data['is_sarcastic']

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=40)

In [10]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(C=1,solver='liblinear',random_state=40,max_iter=1000)
lr.fit(X_train
       ,y_train)

pred_y=lr.predict(X_val)

from sklearn.metrics import confusion_matrix,classification_report

print(confusion_matrix(y_val,pred_y),'\n\n')
print(classification_report(y_val,pred_y))

[[3401 1336]
 [2140 1976]] 


              precision    recall  f1-score   support

           0       0.61      0.72      0.66      4737
           1       0.60      0.48      0.53      4116

    accuracy                           0.61      8853
   macro avg       0.61      0.60      0.60      8853
weighted avg       0.61      0.61      0.60      8853



In [11]:
import nltk
import re
import contractions

#stop_words 
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('are')
stop_words.remove('and')
stop_words.remove('to')
stop_words.remove('she')
stop_words.remove('he')
stop_words.remove('the')

#load up a simple porter stemmer

ps = nltk.porter.PorterStemmer()

def prepro(text):
    
    # expand contractions
    text = contractions.fix(text)
    
    # remove unnecessary characters
    text = re.sub(r'[^a-zA-Z]',r' ', text)
    text = re.sub(r'nbsp', r'', text)
    text = re.sub(' +', ' ', text)
    
    # simple porter stemming
    text = ' '.join([ps.stem(word) for word in text.split()])
    
    # stopwords removal
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

stp = np.vectorize(prepro)



In [12]:
X_train['headline']=data['headline']
X_val['headline']=data['headline']


X_train['clean_headline'] = stp(X_train['headline'].values)
X_val['clean_headline'] = stp(X_val['headline'].values)

X_train.head()

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,headline,clean_headline
34339,55,10,5.0,1,0,more than $30 worth of burned cds stolen from ...,worth burn cd stolen resid
43787,75,11,6.25,0,0,line of lizards winding out door outside natio...,line lizard wind door outsid nation geograph c...
20401,68,12,5.230769,0,0,all of the wacky and wonderful royal wedding m...,the wacki and wonder royal wed memorabilia buy
13274,43,6,6.142857,3,0,study: majority of 'calm downs' ineffective,studi major calm ineffect
23798,70,12,5.384615,0,0,kitchenaid announces it will lift ban on selli...,kitchenaid announc lift ban sell mixer to unw ...


In [13]:
X_train_char=X_train.drop(['clean_headline','headline'],axis=1).reset_index(drop=True)
X_val_char=X_val.drop(['clean_headline','headline'],axis=1).reset_index(drop=True)

from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer()
X_t_cv=pd.DataFrame(cv.fit_transform(X_train['clean_headline']).toarray())
X_v_cv=pd.DataFrame(cv.fit_transform(X_val['clean_headline']).toarray())



comb=[X_train_char, X_t_cv]
comb1=[X_val_char, X_v_cv]

X_train_combined = pd.concat(comb, axis=1)
X_val_combined = pd.concat(comb1, axis=1)

In [None]:
# file_path='X_train_combined.csv'
# X_train_combined.to_csv(file_path,index=False)

In [14]:
X_train_combined.shape

(35409, 16451)

In [19]:
!pip install vaex

Collecting vaex
  Using cached vaex-4.9.2-py3-none-any.whl (4.7 kB)
Collecting vaex-server<0.9,>=0.8.1
  Using cached vaex_server-0.8.1-py3-none-any.whl (23 kB)
Collecting vaex-jupyter<0.9,>=0.8.0
  Using cached vaex_jupyter-0.8.0-py3-none-any.whl (43 kB)
Collecting vaex-ml<0.18,>=0.17.0
  Using cached vaex_ml-0.17.0-py3-none-any.whl (56 kB)
Collecting vaex-astro<0.10,>=0.9.1
  Using cached vaex_astro-0.9.1-py3-none-any.whl (20 kB)
Collecting vaex-core<4.10,>=4.9.2
  Using cached vaex-core-4.9.2.tar.gz (2.2 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting vaex-viz<0.6,>=0.5.2
  Using cached vaex_viz-0.5.2-py3-none-any.whl (19 kB)
Collecting vaex-hdf5<0.13,>=0.12.2
  Using cached vaex_hdf5-0.12.2

  error: subprocess-exited-with-error
  
  Building wheel for vaex-core (pyproject.toml) did not run successfully.
  exit code: 1
  
  [258 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-310
  creating build\lib.win-amd64-cpython-310\vaex
  copying vaex\agg.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\array_types.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\asyncio.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\benchmark.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\cache.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\column.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\config.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\convert.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\cpu.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\dataframe.py -> build\lib.win-amd64-cpython-310\vaex
  copying vaex\datafra

In [20]:
import vaex

vaex_df=vaex.from_csv(file_path,covert=True,chunk_size=50000)

vaex_df=vaex.open('X_train_combined.csv.hdf5')
vaex_df.head()

ModuleNotFoundError: No module named 'vaex'

In [15]:
lr.fit(X_train_combined
       ,y_train)

pred_y_comb=lr.predict(X_val_combined)

# from sklearn.metrics import confusion_matrix,classification_report

print(confusion_matrix(y_val,pred_y_comb),'\n\n')
print(classification_report(y_val,pred_y_comb))



MemoryError: Unable to allocate 4.34 GiB for an array with shape (16450, 35409) and data type int64