## Read files into a pandas DataFrame

In [1]:
PATH="/home/kirana/Documents/phd"
DATAPATH="/home/kirana/Documents/phd/data/experiment/SMSS"

In [2]:
import fastai
from fastai.text import *
from fastai import *

In [3]:
ls {DATAPATH}

[0m[01;34meval[0m/  [01;34minter[0m/  README.md  [01;34mtraining[0m/


In [4]:
ls {DATAPATH}/training

SMSS__FULL.csv


In [5]:
ls {DATAPATH}/eval

SMSS__DEV.csv  SMSS__TEST.csv


In [6]:
df_train=pd.read_csv(f'{DATAPATH}/training/SMSS__FULL.csv',header=None,names=['text','label'])

In [7]:
df_train.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [8]:
df_valid=pd.read_csv(f'{DATAPATH}/eval/SMSS__DEV.csv',header=None,names=['text','label'])
df_test=pd.read_csv(f'{DATAPATH}/eval/SMSS__TEST.csv',header=None,names=['text','label'])

In [9]:
df_train['dstype']='train'
df_valid['dstype']='valid'
df_test['dstype']='test'

In [10]:
df=pd.concat([df_train,df_valid,df_test],0)

In [11]:
df.shape

(5574, 3)

In [12]:
df['label'].value_counts()

ham     4827
spam     747
Name: label, dtype: int64

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
myle=LabelEncoder()

In [15]:
df['label']=myle.fit_transform(df['label'])

In [16]:
df['label'].value_counts()

0    4827
1     747
Name: label, dtype: int64

## Cross-Validation

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
df_train,df_valid=train_test_split(df,train_size=0.9,test_size=0.1,random_state=11,shuffle=True)

In [19]:
df.shape,df_train.shape,df_valid.shape

((5574, 3), (5016, 3), (558, 3))

In [20]:
df.head()

Unnamed: 0,text,label,dstype
0,"Go until jurong point, crazy.. Available only ...",0,train
1,Ok lar... Joking wif u oni...,0,train
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,train
3,U dun say so early hor... U c already then say...,0,train
4,"Nah I don't think he goes to usf, he lives aro...",0,train


In [21]:
df_train_file=df_train.loc[:,['label','text']]
df_valid_file=df_valid.loc[:,['label','text']]

In [22]:
df_train_file.to_csv(f'{DATAPATH}/inter/df_train.csv',index=False)
df_valid_file.to_csv(f'{DATAPATH}/inter/df_valid.csv',index=False)



## Tokenize and Numericalize

In [23]:
re1 = re.compile(r'  +')

def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

def get_texts(df, n_lbls=1):
    labels = df.iloc[:, range(n_lbls)].values.astype(np.int64)

    texts = f'\n{BOS} {FLD} 1 ' + df.iloc[:,n_lbls].astype(str)
    for i in range(n_lbls + 1, len(df.columns)):
        texts += f' {FLD} {i - n_lbls} ' + df[i].astype(str)
    texts = texts.apply(fixup).values.astype(str)

    tokenizer = Tokenizer(n_cpus=10)
    tokop=tokenizer.process_all(texts)
    return tokop, list(labels)


def get_all(df, n_lbls):
    tok, labels = [], []
    #import pdb
    #pdb.set_trace()
    for i, txt in enumerate(df):
        tok_, labels_ = get_texts(txt, n_lbls)
        tok += tok_
        labels += labels_
    return tok, labels

In [24]:
chunksize=24000
chunk_train=pd.read_csv(f'{DATAPATH}/inter/df_train.csv',chunksize=chunksize)
chunk_valid=pd.read_csv(f'{DATAPATH}/inter/df_valid.csv',chunksize=chunksize)



In [25]:
train_tokens, train_labels = get_all(chunk_train, 1)
valid_tokens, valid_labels = get_all(chunk_valid, 1)


In [26]:
df_train['words']=train_tokens
df_valid['words']=valid_tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [27]:
freq = Counter(p for o in train_tokens for p in o)
freq.most_common(25)

[('xxmaj', 10350),
 ('1', 5106),
 (' \n ', 5016),
 ('xxbos', 5016),
 ('xxfld', 5016),
 ('.', 4525),
 ('xxup', 3480),
 ('i', 2648),
 ('to', 2016),
 ('you', 1998),
 (',', 1746),
 ('?', 1328),
 ('a', 1306),
 ('the', 1205),
 ('!', 1168),
 ('...', 1020),
 ('u', 1013),
 ('and', 863),
 ('in', 811),
 ('is', 807),
 ('me', 693),
 ('my', 673),
 ('it', 649),
 ('for', 635),
 ('..', 616)]

In [28]:
max_vocab = 60000
min_freq = 2

itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [29]:
stoi = collections.defaultdict(lambda: 0, { v: k for k, v in enumerate(itos) })
len(itos)

2750

In [30]:
trn_lm = np.array([ [stoi[o] for o in p] for p in train_tokens ])
val_lm = np.array([ [stoi[o] for o in p] for p in valid_tokens ])


In [31]:
df_train['tokens']=trn_lm
df_valid['tokens']=val_lm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [32]:
pickle.dump([df_train,df_valid,itos, train_tokens, valid_tokens, trn_lm, val_lm],open(f'{DATAPATH}/inter/dfs_tokens_fastai.pkl','wb'))

In [88]:
[df_train,df_valid,itos, train_tokens, valid_tokens, trn_lm, val_lm]=pickle.load(open(f'{DATAPATH}/inter/dfs_tokens_fastai.pkl','rb'))