In [1]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
Collecting torch_nightly
[?25l  Downloading https://download.pytorch.org/whl/nightly/cu92/torch_nightly-1.2.0.dev20190805%2Bcu92-cp36-cp36m-linux_x86_64.whl (704.8MB)
[K     |████████████████████████████████| 704.8MB 26kB/s 
[?25hInstalling collected packages: torch-nightly
Successfully installed torch-nightly-1.2.0.dev20190805+cu92


In [33]:
# import libraries
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os
import nltk
import sklearn.metrics
import sklearn

In [3]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))


Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
documents = dataset.data


In [5]:
df = pd.DataFrame({'label':dataset.target,'text':dataset.data})



In [6]:
df.shape

(11314, 2)

In [7]:
df.head()

Unnamed: 0,label,text
0,17,Well i'm not sure about the story nad it did s...
1,0,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,17,Although I realize that principle is not one o...
3,11,Notwithstanding all the legitimate fuss about ...
4,10,"Well, I will have to change the scoring on my ..."


In [8]:
df['label'].value_counts()


10    600
15    599
8     598
9     597
11    595
13    594
7     594
14    593
5     593
12    591
2     591
3     590
6     585
1     584
4     578
17    564
16    546
0     480
18    465
19    377
Name: label, dtype: int64

In [9]:
## selecting only 1 and 10 labels for binary classification


In [10]:
df=df[df['label'].isin([1,10])]


In [11]:
df

Unnamed: 0,label,text
4,10,"Well, I will have to change the scoring on my ..."
10,1,Archive-name: graphics/resources-list/part1\nL...
17,10,"\nAnd of course, Mike Ramsey was (at one time)..."
24,10,"As I promised, I would give you the name of th..."
28,10,GAME(S) OF 4/15\n---------------\nADIRONDACK 6...
...,...,...
11248,10,The Hawks win!! Jermey Roenick scored his 50 ...
11259,10,I think that NHLPA' 93 is the best video game ...
11267,1,\nI am in the market for a 24-bit graphics car...
11288,1,"Hi there,\n\nis there anybody who know a polyg..."


In [12]:
df = df.reset_index(drop=True)

In [13]:
df

Unnamed: 0,label,text
0,10,"Well, I will have to change the scoring on my ..."
1,1,Archive-name: graphics/resources-list/part1\nL...
2,10,"\nAnd of course, Mike Ramsey was (at one time)..."
3,10,"As I promised, I would give you the name of th..."
4,10,GAME(S) OF 4/15\n---------------\nADIRONDACK 6...
...,...,...
1179,10,The Hawks win!! Jermey Roenick scored his 50 ...
1180,10,I think that NHLPA' 93 is the best video game ...
1181,1,\nI am in the market for a 24-bit graphics car...
1182,1,"Hi there,\n\nis there anybody who know a polyg..."


In [14]:
df['label'].value_counts()

10    600
1     584
Name: label, dtype: int64

In [15]:
## preprocessing the data

In [16]:
df['text']= df['text'].str.replace("^[a-zA-Z]"," ")


In [17]:
#removing stopwords

nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words  = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
## tokenizing the text

In [19]:
tokenized_doc = df['text'].apply(lambda x:x.split())

#remove stopwords
tokenized_doc = tokenized_doc.apply(lambda x:[item for item in x if item not in stop_words])


In [20]:
## detokenize

detokenized_doc=[]

for i in range(len(df)):
  t = ' '.join(tokenized_doc[i])
  detokenized_doc.append(t)

df['text']=detokenized_doc


In [21]:
tokenized_doc

0       [ell,, I, change, scoring, playoff, pool., Unf...
1       [rchive-name:, graphics/resources-list/part1, ...
2       [And, course,, Mike, Ramsey, (at, one, time), ...
3       [I, promised,, I, would, give, name, Panther's...
4       [AME(S), OF, 4/15, ---------------, ADIRONDACK...
                              ...                        
1179    [Hawks, win!!, Jermey, Roenick, scored, 50, th...
1180    [think, NHLPA', 93, best, video, game, availab...
1181    [I, market, 24-bit, graphics, card, PC, (ISA, ...
1182    [there,, anybody, know, polygon_reduction, alg...
1183    [ou, might, want, clarify, 11, game, winning, ...
Name: text, Length: 1184, dtype: object

In [22]:
detokenized_doc

["ell, I change scoring playoff pool. Unfortunately I time right now, I certainly post new scoring rules tomorrow. Does matter? No, enter anyway!!! Good! -- Keith Keller LET'S GO RANGERS!!!!! LET'S GO QUAKERS!!!!! kkeller@mail.sas.upenn.edu IVY LEAGUE CHAMPS!!!!",
 "And course, Mike Ramsey (at one time) captain Buffalo prior traded Pittsburgh. Currently, Penguins 3 former captains 1 real captain (Lemieux) playing them. They rotate A's season (and even C Mario out). Even Troy Loney worn C Pens. -Jay",
 'I promised, I would give name Panther\'s president. After Huizenga announced team name, announced Bill Torrey named first president Panthers. A little Bio _Sun-Sentinel_ Torrey, architect four consecutive Stanley Cup champions persident general manager New York Islanders. Throughout 27 years NHL, Bill Torrey\'s bow ties become much signature Andre Agassi\'s hair. The Panthers introduce uniform, insignia, ticket-price information early next month. In meantime, Huizenga leaves day-to-day o

In [23]:
from sklearn.model_selection import  train_test_split

x_train , x_val = train_test_split(df,stratify=df['label'],test_size=0.3,random_state=0)



In [24]:
x_train.shape , x_val.shape

((828, 2), (356, 2))

In [25]:
data_lm = TextLMDataBunch.from_df(train_df = x_train, valid_df = x_val, path = "")


In [26]:
data_cls = TextClasDataBunch.from_df(path = "", train_df = x_train, valid_df = x_val, vocab=data_lm.train_ds.vocab, bs=32)


In [35]:

learn = language_model_learner(data_lm,  arch = AWD_LSTM, pretrained = True, drop_mult=0.7)

Downloading https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd.tgz


In [36]:
learn.fit_one_cycle(1,1e-2)


epoch,train_loss,valid_loss,accuracy,time
0,5.346852,4.508186,0.268479,00:04


In [43]:
learn.save('model1')

learn.save_encoder('enc1')

In [44]:
learn = text_classifier_learner(data_cls,arch=AWD_LSTM,drop_mult=0.7)




In [45]:
learn.load_encoder('enc1')  

RNNLearner(data=TextClasDataBunch;

Train: LabelList (828 items)
x: TextList
xxbos xxmaj my votes ( xxup xxunk ) : xxmaj team xxup mvp : xxmaj pat xxmaj verbeek . xxmaj he fans 25 % goal mouth feeds , still 36 goals terrible start xxunk ( sp ? ) team captain throughout tough couple seasons . xxmaj honorable mention : xxmaj nick xxmaj kypreos xxmaj mark xxmaj janssens . xxmaj probably appropriate xxunk xxunk category xxup mvp , xxmaj kypreos ( 17 goals , 320 + xxup pim ) hardest working player team xxmaj janssens underrated defensive center checker . i guess i place greater emphasis hard work skill determining value . xxmaj biggest surprise : xxmaj geoff xxmaj sanderson . xxmaj he 13 goals 31 points last season center , moved left wing far put 45 goals 80 + points . xxmaj he new xxmaj whaler record 21 power play goals , coming right wing faceoff circle , garden spot . xxmaj honorable mention : xxmaj andrew xxmaj cassels xxmaj terry xxmaj yake . xxmaj the xxunk xxunk xxmaj sanderson , xx

In [52]:
learn.fit_one_cycle(1,1e-2)


epoch,train_loss,valid_loss,accuracy,time
0,0.719787,0.708318,0.494382,00:09


In [56]:
pres ,target = learn.get_preds()
predictions = np.argmax(preds,axis=1)
pd.crosstab(predictions,target)



col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,176,180
