# Required libraries

In [13]:
import pandas as pd
import numpy as np 
import seaborn as sns 

# scanning the file to dataframe 

In [14]:
df = pd.read_csv('train.csv')

# get sample because data is to large 

In [15]:
df = df.sample(10000)

In [16]:
df[df['label']==0].text.tolist()

['At 2:30 p. m. on Thursday, Representative Maxine Waters was on the floor of the House of Representatives, arguing for the importance of the Securities and Exchange Commission. “At this time,” Ms. Waters, Democrat of California, said, “with a bill that would basically take our cop on the block, the S. E. C. and literally obliterate  —   —  ” Alas, politics junkies, news editors and anyone else who was watching the broadcast online did not learn how that sentence ended. Ms. Waters was cut off. Instead, they heard the jangling music of a feed from RT, a   Russian television network that has been accused of helping its government interfere in the American election. Some on social media immediately assumed that the interruption, which lasted about 10 minutes, had nefarious implications.    in a statement, had a simpler explanation: It was probably a technical error.  ’s television broadcast continued uninterrupted. Noting that RT is among the news feeds it regularly monitors, it said: “We

In [17]:
from nltk.corpus import stopwords
import nltk

In [18]:
nltk.download("stopwords") # download stop words  because this is will replace it 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eslam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# shape of data frame

In [20]:
{f"number of row={df.shape[0]}  number of columns (features) = {df.shape[1]}"}

{'number of row=10000  number of columns (features) = 5'}

# number of missing values in each colums

In [21]:
df.isnull().sum() /100

id        0.00
title     2.77
author    9.74
text      0.16
label     0.00
dtype: float64

In [22]:
df = df.fillna('') # we will fill na as a string 

In [23]:
df['content'] = df['title'] +" " +df['author'] # concat 2 columns because fitting 

In [24]:
df.drop(['title' , 'author'] , axis =1 , inplace =True)

# steaming the data 

In [25]:
# now we will stemming the words : 
# example : actress > act 
from nltk.stem.porter import PorterStemmer
port_stemming = PorterStemmer()

In [26]:
def stemming (content):
    import re
    stemming_content = re.sub('^a-zA-Z' , ' '  ,content)
    stemming_content = stemming_content.lower()
    stemming_content = stemming_content.split()
    stemming_content = [port_stemming.stem(i) for i in stemming_content if not i in stopwords.words("english")]
    stemming_content = " ".join(stemming_content)
    return stemming_content

In [27]:
df['content'] = df['content'].apply(stemming)


KeyboardInterrupt



# splitting the data to label and train data 

In [None]:
x = df['content']
y = df['label']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier , AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# Implementation - Creating a Training and Predicting Pipeline

In [None]:
def xx(x):
    return x.toarray()

In [None]:
steps = list()
steps.append(("model" , TfidfVectorizer(stop_words="english")))
steps.append(('transformer' , FunctionTransformer(xx)))
steps.append(("mode" , BernoulliNB()))

In [None]:
pipeline = Pipeline(steps=steps)

# splitting the data to train and test 

In [None]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y , stratify= y , shuffle=True , test_size=0.30,random_state=7)

In [None]:
y_train.value_counts()

# fit the model 

In [None]:
from sklearn import set_config
set_config(display="diagram")
pipeline.fit(x_train , y_train)

# Evaluating Model Performance

In [None]:
predicted_test =  pipeline.predict(x_test)
predicted_train = pipeline.predict(x_train)

In [None]:
print(f"accuracy of train {accuracy_score(predicted_train ,y_train )}")

In [None]:
print(f"accuracy of train {accuracy_score(predicted_test ,y_test )}")

# save the model 

In [None]:
import joblib

In [None]:
x = ['content']

In [None]:
joblib.dump(pipeline , "Model.h5")
joblib.dump(x,"input.h5")

# depoly the model by streamlit 

In [None]:
%%writefile MyApp.py


import pandas as pd 
import joblib
import streamlit as st
def xx(x):
    return x.toarray()


model = joblib.load("Model.h5")
inputs = joblib.load("input.h5")

def predicted(content):
    test_dataframe = pd.DataFrame(columns=inputs)
    test_dataframe.at[0, 'content'] = content
    results = model.predict(test_dataframe)
    return results 
def main ():
    st.title("welcome in fake or real news prdictor")
    content = st.text_input("enter the content")
    
    
    
    if st.button("Predict"):
        result = predicted(content)
        st.write("this news is  {}".format(result))
if __name__ == "__main__":
    main()       

In [None]:
!streamlit run MyApp.py