In [2]:
#bag of words : span
from sklearn.feature_extraction.text import CountVectorizer
v=CountVectorizer(ngram_range=(1,3))
train_data=["The sun is shining","The weather is sweet","The sun is shining and the weather is sweet"]
v.fit(train_data)
v.vocabulary_


{'the': 14,
 'sun': 10,
 'is': 3,
 'shining': 7,
 'the sun': 15,
 'sun is': 11,
 'is shining': 4,
 'the sun is': 16,
 'sun is shining': 12,
 'weather': 19,
 'sweet': 13,
 'the weather': 17,
 'weather is': 20,
 'is sweet': 6,
 'the weather is': 18,
 'weather is sweet': 21,
 'and': 0,
 'shining and': 8,
 'and the': 1,
 'is shining and': 5,
 'shining and the': 9,
 'and the weather': 2}

In [3]:
v.transform(["The sun is shining and the weather is sweet"]).toarray()

array([[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]])

In [4]:
v.transform(["sun is shining"]).toarray()

array([[0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [5]:
import pandas as pd
df=pd.read_json('./data/news_dataset.json')
df.shape

(12695, 2)

In [6]:
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [7]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [8]:

min_samples=1000
df_business=df[df.category=='BUSINESS'].sample(min_samples,random_state=11)
df_sports=df[df.category=='SPORTS'].sample(min_samples,random_state=11)
df_crime=df[df.category=='CRIME'].sample(min_samples,random_state=11)
df_science=df[df.category=='SCIENCE'].sample(min_samples,random_state=11)

In [9]:
df_balanced=pd.concat([df_business,df_sports,df_crime,df_science],axis=0)
df_balanced.category.value_counts()

category
BUSINESS    1000
SPORTS      1000
CRIME       1000
SCIENCE     1000
Name: count, dtype: int64

In [10]:
target={
    
    'BUSINESS':0,
    'SPORTS':1,
    'CRIME':2,
    'SCIENCE':3
}
df_balanced['target']=df_balanced['category'].map(target)

In [11]:
df_balanced

Unnamed: 0,text,category,target
1190,Corporate's Responsibility Toward Social Susta...,BUSINESS,0
2844,Volunteering Surprisingly Makes You Feel Like ...,BUSINESS,0
10064,We Might Be All Wrong About Robots Taking Our ...,BUSINESS,0
2817,"Like Prince, A Majority Of Americans Don't Hav...",BUSINESS,0
9172,8 Ways Leaders Become Better Leadership still ...,BUSINESS,0
...,...,...,...
1008,Eastern Monarch Butterflies May Be At Risk Of ...,SCIENCE,3
63,Low Energy Nuclear Reactions: Papers and Paten...,SCIENCE,3
1839,Hawking Warns We're Facing A Disaster In Next ...,SCIENCE,3
2064,Think Weather Forecasts Are Bad? Try Forecasti...,SCIENCE,3


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['text'], df_balanced['target'], test_size=0.2, random_state=11,stratify=df_balanced['target'])

In [13]:
print(X_train.shape, X_test.shape)

(3200,) (800,)


In [14]:
y_test.value_counts(normalize=True)

target
1    0.25
0    0.25
2    0.25
3    0.25
Name: proportion, dtype: float64

In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
clf=Pipeline([
    ('vect',CountVectorizer(ngram_range=(1,3))),
    ('mult',MultinomialNB())
])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.67      0.89      0.77       200
           1       0.91      0.78      0.84       200
           2       0.86      0.89      0.87       200
           3       0.91      0.72      0.80       200

    accuracy                           0.82       800
   macro avg       0.84      0.82      0.82       800
weighted avg       0.84      0.82      0.82       800



In [16]:
clf.predict(["samana money is finished"])

array([0])

In [17]:
clf.predict(["sport  is booming"])

array([1])

In [18]:
import spacy
nlp=spacy.load('en_core_web_sm')


In [19]:
def prep(x):
    doc=nlp(x)
    docn=[]
    for token in doc:
        if  not token.is_stop:
            docn.append(token)
    return " ".join(str(x) for x in docn)
newdf=df_balanced.sample(10,random_state=11)
newdf['prep']=newdf['text'].apply(prep)


In [20]:
newdf

Unnamed: 0,text,category,target,prep
7290,"Chinese Space Station, Adrift For Years, Plumm...",SCIENCE,3,"Chinese Space Station , Adrift Years , Plummet..."
12422,Serena Williams Knocked Out Of Olympics In Stu...,SPORTS,1,Serena Williams Knocked Olympics Stunning - Ro...
10011,Why Progressives Should Think Twice About Embr...,BUSINESS,0,Progressives Think Twice Embracing Uber Lyft S...
1674,North Pole Temperature Jumps Above Freezing Fr...,SCIENCE,3,North Pole Temperature Jumps Freezing Bizarre ...
12238,Accused 'Kayak Killer' Pushed Paddle Away From...,CRIME,2,Accused ' Kayak Killer ' Pushed Paddle Away Dr...
140,Canadian Skier Suffers Possible Broken Pelvis ...,SPORTS,1,Canadian Skier Suffers Possible Broken Pelvis ...
11161,Police Make Arrest In SoCal Lemonade Stand Hei...,CRIME,2,Police Arrest SoCal Lemonade Stand Heist man s...
8236,Scientists Unearth Fossils Of Rats The Size Of...,SCIENCE,3,Scientists Unearth Fossils Rats Size Small Dog...
3626,The Evolution Of Stephen Curry: The Man The NB...,SPORTS,1,Evolution Stephen Curry : Man NBA Eyes conside...
9643,This Is The Way 'Williamsburg' Ends,BUSINESS,0,Way ' Williamsburg ' Ends


In [21]:
#Exercise
df=pd.read_csv('./data/Fake.csv')
df.head()
df.drop(columns=['date','subject'],inplace=True)
df['val']=0

In [22]:
#Exercise
df2=pd.read_csv('./data/True.csv')
df2.head()
df2.drop(columns=['date','subject'],inplace=True)
df2['val']=1

In [23]:
df3=pd.concat([df,df2],axis=0)
df4=df3.sample(frac=1,random_state=11)
df4


Unnamed: 0,title,text,val
18478,PRESIDENT TRUMP Makes Huge Announcement on Oba...,PRESIDENT TRUMP S REMARKS TODAY IN MIAMI INCLU...,0
6079,Top Gun Lobbyist Threatens To Use The ‘Bullet...,A top gun lobbyist has just warned Americans t...,0
1612,Trump Just Claimed Andrew Jackson Had Psychic...,In an interview with Washington Examiner repor...,0
4577,Donald Trump Panicking As Ugly Divorce Record...,It s no secret that Donald Trump is cruel to w...,0
10175,WATCH: CONSERVATIVE SOCIAL MEDIA GIANT Announc...,"Hilarious and spot-on conservative, social-med...",0
...,...,...,...
8600,U.S. intelligence chief not ready to identify ...,"ASPEN, Colo. (Reuters) - Director of National ...",1
7259,"Microsoft Forced To Remove Racist, Sexist Rob...","After only 16 hours, Microsoft was forced to p...",0
21584,2009 DUKES OF HAZARD VIDEO EERILY PREDICTS THE...,,0
13062,Trump to deliver remarks on decision on Jerusa...,WASHINGTON (Reuters) - President Donald Trump ...,1


In [29]:
df4['text']=df4['title']+" "+df4['text']
df4.drop(columns=['title'],inplace=True)
# df4['text']=df4['text'].apply(prep)


KeyError: 'title'

In [30]:
df4=df4.sample(1000,random_state=11)

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( df4['text'], df4['val'], test_size=0.3)


In [32]:
clf=Pipeline([
    ('vect',CountVectorizer(ngram_range=(1,3))),
    ('mult',MultinomialNB())
])


In [33]:
clf.fit(X_train,y_train)


In [34]:
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       162
           1       0.99      0.95      0.97       138

    accuracy                           0.97       300
   macro avg       0.98      0.97      0.97       300
weighted avg       0.97      0.97      0.97       300



In [36]:
clf.predict(["Failed vote to oust president shakes up Peru's politics,LIMA (Reuters) - Peru’s President Pedro Pablo Kuczynski could end up the surprise winner of an attempt to oust him from power this week, after some opposition lawmakers broke ranks with party leaders to support him, opening a divide that might strengthen his hand. Despite having a Congressional majority, the rightwing opposition party Popular Force was unable to push through a motion to remove Kuczynski from office on Thursday, after 10 of its own lawmakers broke ranks to save the president. The vote cemented a growing divide in the opposition and looked to threaten its control over Congress, potentially aiding Kuczynski as he tries to restore political stability and revive investments in one of Latin America’s most robust economies. The surprise defection was the result of a deal struck between Kuczynski and Popular Force rebel lawmaker Kenji Fujimori to get his father and ex-president Alberto Fujimori out of prison, alleged Popular Force secretary general, Jose Chlimper. Over the past year, Kenji has courted Kuczynski’s center-right government while challenging his sister Keiko’s leadership of the rightwing populist movement that their father formed in the 1990s. In defiance of his sister, Kenji threw his support behind Kuczynski ahead of the vote on whether to remove him from office over unproven graft allegations. Nine other Popular Force lawmakers followed his lead. “This is the birth of a serious and formal split (in the Fujimori movement),” said Guillermo Loli, the head of political research for pollster Ipsos Peru. “Everything points to a pardon,” he added.     Kuczynski’s government denied that a pardon for Fujimori was part of its political negotiations. In an address to the nation late on Friday, Kuczynski said he would spend the coming days reflecting on his year and a half in office. “I’ll be announcing to you changes to make sure 2018 is not just a year of greater growth, but politically different,” Kuczynski said. Efforts to reach the Popular Force lawmakers who defected were not successful. One, Clayton Galvan, said on local TV channel Canal N that Alberto Fujimori called them from prison to ask them to help Kuczynski stay in power.     Alberto Fujimori, who is serving a 25-year sentence for graft and human rights crimes, is a deeply divisive figure in Peru. While many consider him a corrupt dictator, others credit him with ending an economic crisis and bloody leftist insurgency during his 1990-2000 term. Freeing him would likely anger the well-organized foes of the Fujimori clan - a mix of technocrats, leftists, human rights activists and academics. “The day (Kuczynski) signs a pardon, he loses all of those guys. Permanently,” said Harvard University political scientist Steve Levitsky. Support from the anti-Fujimori crowd was key to Kuczynski’s razor-thin victory over Keiko in last year’s presidential election, and to keeping the motion to oust him from succeeding. “Kuczynski was saved by two diametrically opposed political groups: Kenji’s group and the left, which opposes a pardon. He can’t please both of them,” said Levitsky. Kuczynski, a 79-year-old former investment banker, took office amid hopes he would usher in cleaner government and faster economic growth. Instead, a graft scandal roiling Latin America has stalled investments and ensnared him in allegations of wrongdoing. Before the vote on Thursday, Kuczynski fanned fears of a return to Peru’s authoritarian past and described the motion as part of a legislative “coup” attempt by Keiko’s supporters. Popular Force denies the charge and says the bid to remove him was part of its fight against corruption and within the bounds of the constitution. A hardline Popular Force lawmaker loyal to Keiko, Hector Becerril, said the Kenji faction represented “traitors.”  “If they have any sense of decency after this vote, the least they could do is present their resignations,” Becerril told journalists on Friday. “Hopefully today.” With 10 votes fewer, Popular Force would command 61 seats in the 130-member, single-chamber Congress, less than an absolute majority, though it would still be the biggest voting bloc. The political crisis has cost Kuczynski his interior minister, Carlos Basombrio, who announced his resignation on Friday. Kuczynski could make a decision about other Cabinet changes in coming days, his government said.            ,politicsNews,December 23, 2017 "])

array([1])

In [37]:
clf.predict(["UNIONS MUST GO BEYOND CALLING FOR A CEASE-FIRE IN GAZA The growing swell of American unions demanding a cease-fire in Gaza is heartening. But labor will have to take its antiwar commitments further than issuing statements to stop Israel’s wanton slaughter."])

array([1])