# Predict the topics of each story

______________________________________________________________________

##  1- Import libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
import re
import os
from task1 import add_columns

## 2- Load the data

In [2]:
# Labelled stories used to fit the classifier.
df_train = pd.read_csv('../data/stories.csv')

# Stories whose topic has to be predicted.
df_test = pd.read_csv('../data/to_fill.csv')

## 3- Preprocessing data
- Dealing with missing data
- Removing Stop words
- Lower casing
- Tokenization
- Stemming
- Lemmatization (listed for completeness; this notebook only applies stemming)

### Dealing with missing data

In [3]:
# Size of the raw training set as (rows, columns).
df_train.shape
# output:
    # (5181, 2)

# No cell is an actual NaN at this point.
df_train.isna().sum()
# output:
    # body     0
    # topic    0
    # dtype: int64

# Cells holding exactly one space carry no text: convert them to NaN so they
# are counted (and later dropped) as missing values.
df_train = df_train.replace(' ', np.nan)
df_train.isna().sum()
# output:
    # body     29
    # topic     0
    # dtype: int64

# Only 29 of the 5181 rows (about 0.6%) have no body, so dropping them costs
# almost no training data.
df_train = df_train.dropna()
df_train.shape
# output:
    # (5152, 2)

df_train.head()

Unnamed: 0,body,topic
4,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."
8,news now out of North Hollywood. A 14 yearold ...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
11,homelessness his city's greatest failure. That...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '74e2..."
12,Minneapolis police officer Kim Potter guilty o...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."
15,Judy an update now to the wildfires that wiped...,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9a06..."


### Preprocessing text

In [4]:
stemmer = PorterStemmer()

# Load the English stop words once and keep them in a set. Calling
# stopwords.words('english') inside the token loop would rebuild the list for
# every single token and then scan it linearly.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def process_text(text):
    """Normalise a raw story into a list of stemmed, stop-word-free tokens.

    Steps: lowercase the text, replace every non-alphabetic character with a
    space, tokenize, drop English stop words and stem what is left with the
    Porter stemmer.

    Requires the NLTK stop-word corpus and tokenizer models to be installed.

    Parameters
    ----------
    text : str
        Raw text of one story.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # Make all the strings lowercase and remove non alphabetic characters
    text = re.sub('[^A-Za-z]', ' ', text.lower())

    # Tokenize the text
    tokenized_text = word_tokenize(text)

    # Remove the stopwords and stem each word to its root
    clean_text = [
        stemmer.stem(word) for word in tokenized_text
        if word not in _ENGLISH_STOPWORDS
    ]

    return clean_text

## 4- Prepare training data and test data

In [5]:
# Pull the text and label columns out of the dataframes as NumPy arrays.
bodies_train, topics_train = (
    np.array(df_train[column]) for column in ('body', 'topic')
)
bodies_test = np.array(df_test['body'])

In [6]:
# Clean every story, then glue its tokens back into one space-separated
# string, which is the input format CountVectorizer expects.
bodies_train = [" ".join(tokens) for tokens in map(process_text, bodies_train)]

# Test rows whose body is the 'Not found' placeholder have no text to
# classify, so they are left out of the list of documents to predict.
bodies_test = [
    " ".join(process_text(body))
    for body in bodies_test
    if body != 'Not found'
]

## 5- Convert text to vector

In [7]:
# Bag-of-words model: the vocabulary is learned from the training stories
# only and then reused, unchanged, to encode the test stories.
matrix = CountVectorizer()

train_counts = matrix.fit_transform(bodies_train)
test_counts = matrix.transform(bodies_test)

# GaussianNB does not accept sparse input, hence the dense conversion.
bodies_train = train_counts.toarray()
bodies_test = test_counts.toarray()

## 6- Training a Naive Bayes classifier

In [14]:
# Fit the Naive Bayes model on the word counts; each distinct value of the
# topic column is treated as one class label.
classifier = GaussianNB().fit(bodies_train, topics_train)

# Predict a topic label for every test story that has a body.
topics_pred = classifier.predict(bodies_test)

## 7- Add topic column to dataframe

In [10]:
# topics_pred only covers the rows that had a body, so walk the full test
# frame and re-insert the 'Not found' placeholder where no prediction exists.
topic = []
j = 0  # position of the next unused prediction
for body in df_test['body']:
    if body != 'Not found':
        label = topics_pred[j]
        j += 1
    else:
        label = 'Not found'
    topic.append(label)

In [15]:
# Attach the aligned predictions as a 'topic' column using the task1 helper.
# NOTE(review): the save step below writes df_test, so this relies on
# add_columns modifying df_test in place - confirm against task1.
add_columns(df_test, ['topic'], [topic])

Unnamed: 0,first_words,last_words,source_video_id,body,start,end,topic
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell ph...",18246,Well knew. This morning police need your help ...,464928,504300,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean. ocean.,12387,Not found,Not found,Not found,Not found
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conserva...",16859,"Paul. Meanwhile, the state set a record in ene...",60704,100410,['83a09c6b-5f2f-421f-ae50-b38acca7e008']
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246,Emergency crews in Florida continue to search ...,505290,534958,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859,But even though the state never ordered rollin...,100910,283306,['83a09c6b-5f2f-421f-ae50-b38acca7e008']
5,"aid. And today, president Joe Biden and first",to view the destruction caused by Hurricane Ian.,18246,"aid. And today, president Joe Biden and first ...",546494,592450,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
6,"In the last month, there have been numerous",are necessary to crack down on those hackers.,18246,"In the last month, there have been numerous da...",614910,699230,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
7,and the warriors are playing the Boston Celtics,that. We'll see if they get it tonight.,12387,and the warriors are playing the Boston Celtic...,419994,649122,['b49207eb-96eb-4b73-b534-adc0ef85022a']
8,And San Leandro police searching for the person,footage to try to piece together more informat...,16859,And San Leandro police searching for the perso...,578612,618980,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."
9,The updated Bivalent Coronavirus booster shot ...,on their vaccinations getting severe illness f...,16859,The updated Bivalent Coronavirus booster shot ...,619310,659730,['96326734-fd82-4350-b45c-513e7eb9147c']


## 8- Save the dataframe to CSV as filled.csv

In [12]:
# Write the completed table straight to filled.csv. to_fill.csv is the input
# this notebook reads when loading the data, so it is left untouched: that
# keeps the notebook re-runnable from a fresh kernel and avoids a rename that
# can fail when filled.csv already exists.
df_test.to_csv('../data/filled.csv', index=False)