In [7]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
#Stemmers remove morphological affixes from words, leaving only the word stem.
from nltk.stem.porter import PorterStemmer

#convert text into feature vectors (numerical data)
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

#model
from sklearn.linear_model import LogisticRegression

#metrics
from sklearn.metrics  import accuracy_score
dslist=stopwords.words('english')


Pre Processing

In [8]:
newsdf=pd.read_csv('train.csv')

In [9]:
newsdf.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
newsdf=newsdf.fillna(" ")

In [11]:
# drop author
newsdf=newsdf.drop(columns=['author'])

In [12]:
#combine title + text
newsdf['content']=newsdf['title']+newsdf['text']

Stemming words + removing Stop words

In [13]:
stemmer=PorterStemmer()

In [14]:
def stemmingprocess(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    singles=[stemmer.stem(x) for x in stemmed_content if not x in dslist]
    stemmed_content=" ".join(singles)

    return stemmed_content



In [15]:
newsdf['content']=newsdf['content'].apply(stemmingprocess)

In [16]:
newsdf['content']

0        hous dem aid even see comey letter jason chaff...
1        flynn hillari clinton big woman campu breitbar...
2        truth might get firedwhi truth might get fire ...
3        civilian kill singl us airstrik identifiedvide...
4        iranian woman jail fiction unpublish stori wom...
                               ...                        
20795    rapper trump poster child white supremaci rapp...
20796    n f l playoff schedul matchup odd new york tim...
20797    maci said receiv takeov approach hudson bay ne...
20798    nato russia hold parallel exercis balkansnato ...
20799    keep f aliv david swanson author activist jour...
Name: content, Length: 20800, dtype: object

Converting text to numerical vectors

In [17]:
vector=TfidfVectorizer()

In [18]:
numText = vector.fit_transform(newsdf['content'])

Spliting labels and result

In [19]:
X=numText
Y=newsdf['label']

In [20]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.9,test_size=0.1,stratify=Y,random_state=2)

Model time

In [21]:
classifier=LogisticRegression()

In [22]:
classifier.fit(X_train,Y_train)

Evaulation

In [23]:
train_set=classifier.predict(X_train)

In [24]:
train_pred=accuracy_score(train_set,Y_train)


In [43]:
print(train_set)

[1 1 1 ... 0 1 0]


In [25]:
print(f"train_set accuracy: {train_pred}")

train_set accuracy: 0.9711538461538461


In [26]:
test_set=classifier.predict(X_test)
test_pred=accuracy_score(test_set,Y_test)
print(f"train_set accuracy: {test_pred}")

train_set accuracy: 0.9360576923076923


In [27]:
train_pred

0.9711538461538461

Prediction System

In [28]:
def predict_news(news_text):
    
    #Preprocess the news text
    processed_text = stemmingprocess(news_text)

    #Vectorize the text using the loaded TfidfVectorizer
    vectorized_text = vector.transform([processed_text])

    #Predict using the loaded LogisticRegression model
    prediction = classifier.predict(vectorized_text)[0]

    return "Real" if prediction == 0 else "Fake"



'Example usage\nnews_entry = "Enter your news content here..."\nresult = predict_news(news_entry)\nprint(f"The news is predicted as: {result}")'

In [39]:
with open("input example.txt", 'r') as file:
    # Read all lines of the file into a list and strip newline characters
    lines = [line.rstrip('\n') for line in file]

# Combine the lines into a single string without extra newline characters
text = ''.join(lines)

In [40]:
text

'In a groundbreaking discovery, scientists at the International Space Research Institute claim to have found evidence of a hidden civilization thriving beneath the surface of Mars. According to the research team, data collected from the Mars Rover indicates the presence of complex structures and organized activity in underground caverns, suggesting the existence of an advanced alien society.Dr. Emily Johnson, lead researcher on the project, stated, "This is a game-changer for our understanding of the universe. We always suspected that Mars held secrets, but the extent of this discovery is beyond anything we could have imagined."The revelation has sparked widespread speculation about the origins and nature of the Martian inhabitants. Some experts believe they may have evolved independently, while others speculate they could be descendants of an ancient Earth civilization.However, skepticism remains high among the scientific community, with many calling for further investigation and veri

In [44]:
print(f"News is {predict_news(text)}")

News is Fake
