In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
import pandas as pd
import numpy as np
import gensim
import nltk
import spacy
import re

!pip install ydata-profiling
from ydata_profiling import ProfileReport
from nltk.corpus import stopwords



IMPORTING **DATASET**

In [3]:
news_df = pd.read_csv('/content/drive/MyDrive/Machine_Learning/projekt/data.csv')

In [5]:
news_df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [6]:
news_df.shape

(4009, 4)

In [10]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URLs      4009 non-null   object
 1   Headline  4009 non-null   object
 2   Body      3988 non-null   object
 3   Label     4009 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 125.4+ KB


PROFILING **REPORT**

In [12]:
#Generating profiling report
profile = ProfileReport(news_df, title='News Dataset Profiling Report', explorative=True)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# Filtering rows with 'politics' in Headline
filtered_articles = news_df[news_df['Headline'].str.contains('politics', case=False, na=False)]

# Number of articles with 'politics'
num_politics_articles = filtered_articles.shape[0]

print(f"Number of articles with 'politics' category: {num_politics_articles}")

Number of articles with 'politics' category: 9


**CLEANING**

In [14]:
news_df.isnull().sum()

Unnamed: 0,0
URLs,0
Headline,0
Body,21
Label,0


In [15]:
news_df.duplicated().sum()

0

In [16]:
news_df_cleaned = news_df.dropna()

In [18]:
news_df_cleaned.shape

(3988, 4)

In [21]:
news_df_cleaned.isnull().sum()

Unnamed: 0,0
URLs,0
Headline,0
Body,0
Label,0
Headline+Body,0


CREATING AND NEW COLUMN HEADLINE + BODY

In [19]:
news_df_cleaned['Headline+Body'] = news_df_cleaned['Headline'] + ' ' + news_df_cleaned['Body']
news_df_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df_cleaned['Headline+Body'] = news_df_cleaned['Headline'] + ' ' + news_df_cleaned['Body']


Unnamed: 0,URLs,Headline,Body,Label,Headline+Body
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald Trump Ima...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tribute Co...


In [23]:
news_df_cleaned = news_df_cleaned.drop(columns=['URLs'])

CLEANING TEXT

In [27]:
# Downloading stopwords from nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
nlp = spacy.load("en_core_web_sm")

In [29]:
def preprocess_text(text):
    # Normalization of text letters
    text = text.lower()
    # Deleting numbers, special signs and extra spaces
    text = re.sub(r'\d+', '', text)  # Deleting numbers
    text = re.sub(r'\W+', ' ', text)  # Deleting extra signs
    text = re.sub(r'\s+', ' ', text).strip()  # Deleting extra spaces
    # Tokenization and deleting stop words
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    # Lematization
    lemmatized_tokens = [token.lemma_ for token in nlp(" ".join(tokens))]
    return " ".join(lemmatized_tokens)

In [30]:
news_df_cleaned['Headline+Body'] = news_df_cleaned['Headline+Body'].apply(preprocess_text)

In [31]:
news_df_cleaned.head()

Unnamed: 0,Headline,Body,Label,Headline+Body
0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,four way bob corker skewer donald trump image ...
1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,linklater war veteran comedy speak modern amer...
2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,trump fight corker jeopardize legislative agen...
3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,egypt cheiron win tie pemex mexican onshore oi...
4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,jason aldean open snl vegas tribute country si...


SAVING DATASET

In [33]:
news_df_cleaned.to_json('/content/drive/MyDrive/Machine_Learning/projekt/news_df_preprocessed.json', orient='records', lines=True)