## Read files into a pandas DataFrame

In [7]:
PATH="/home/kirana/Documents/phd"
DATAPATH="/home/kirana/Documents/phd/data/experiment/YELP_BINARY"

In [8]:
import fastai
from fastai.text import *
from fastai import *

In [9]:
ls {DATAPATH}

[0m[01;34meval[0m/  [01;34minter[0m/  README.md  [01;34mtraining[0m/


In [10]:
ls {DATAPATH}/training

YELP_BINARY__FULL.csv


In [11]:
ls {DATAPATH}/eval

YELP_BINARY__TEST.csv


In [12]:
df_train=pd.read_csv(f'{DATAPATH}/training/YELP_BINARY__FULL.csv',header=None,names=['text','label'])

In [13]:
df_train.head()

Unnamed: 0,text,label
0,"Unfortunately, the frustration of being Dr. Go...",1
1,Been going to Dr. Goldberg for over 10 years. ...,2
2,I don't know what Dr. Goldberg was like before...,1
3,I'm writing this review to give you a heads up...,1
4,All the food is great here. But the best thing...,2


In [15]:
df_test=pd.read_csv(f'{DATAPATH}/eval/YELP_BINARY__TEST.csv',header=None,names=['text','label'])

In [16]:
df_train['dstype']='train'
df_test['dstype']='test'

In [17]:
df=pd.concat([df_train,df_test],0)

In [18]:
df.shape

(598000, 3)

In [19]:
df['label'].value_counts()

2    299000
1    299000
Name: label, dtype: int64

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
myle=LabelEncoder()

In [22]:
df['label']=myle.fit_transform(df['label'])

In [23]:
df['label'].value_counts()

1    299000
0    299000
Name: label, dtype: int64

## Cross-Validation

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
df_train,df_valid=train_test_split(df,train_size=0.9,test_size=0.1,random_state=11,shuffle=True)

In [26]:
df.shape,df_train.shape,df_valid.shape

((598000, 3), (538200, 3), (59800, 3))

In [27]:
df.head()

Unnamed: 0,text,label,dstype
0,"Unfortunately, the frustration of being Dr. Go...",0,train
1,Been going to Dr. Goldberg for over 10 years. ...,1,train
2,I don't know what Dr. Goldberg was like before...,0,train
3,I'm writing this review to give you a heads up...,0,train
4,All the food is great here. But the best thing...,1,train


In [28]:
df_train_file=df_train.loc[:,['label','text']]
df_valid_file=df_valid.loc[:,['label','text']]

In [29]:
df_train_file.to_csv(f'{DATAPATH}/inter/df_train.csv',index=False)
df_valid_file.to_csv(f'{DATAPATH}/inter/df_valid.csv',index=False)



## Tokenize and Numericalize

In [30]:
re1 = re.compile(r'  +')

def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

def get_texts(df, n_lbls=1):
    labels = df.iloc[:, range(n_lbls)].values.astype(np.int64)

    texts = f'\n{BOS} {FLD} 1 ' + df.iloc[:,n_lbls].astype(str)
    for i in range(n_lbls + 1, len(df.columns)):
        texts += f' {FLD} {i - n_lbls} ' + df[i].astype(str)
    texts = texts.apply(fixup).values.astype(str)

    tokenizer = Tokenizer(n_cpus=10)
    tokop=tokenizer.process_all(texts)
    return tokop, list(labels)


def get_all(df, n_lbls):
    tok, labels = [], []
    #import pdb
    #pdb.set_trace()
    for i, txt in enumerate(df):
        tok_, labels_ = get_texts(txt, n_lbls)
        tok += tok_
        labels += labels_
    return tok, labels

In [31]:
chunksize=24000
chunk_train=pd.read_csv(f'{DATAPATH}/inter/df_train.csv',chunksize=chunksize)
chunk_valid=pd.read_csv(f'{DATAPATH}/inter/df_valid.csv',chunksize=chunksize)



In [32]:
train_tokens, train_labels = get_all(chunk_train, 1)
valid_tokens, valid_labels = get_all(chunk_valid, 1)


In [33]:
df_train['words']=train_tokens
df_valid['words']=valid_tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [34]:
freq = Counter(p for o in train_tokens for p in o)
freq.most_common(25)

[('xxmaj', 5970616),
 ('.', 4288619),
 ('the', 3662097),
 (',', 2608744),
 ('and', 2328691),
 ('i', 2241859),
 ('to', 1820289),
 ('a', 1815936),
 ('was', 1336908),
 ('it', 1110132),
 ('of', 1067493),
 ('for', 837770),
 ('in', 831883),
 ('is', 820734),
 ('that', 726390),
 ('\n \n ', 701070),
 ('!', 680911),
 ('my', 661251),
 ('we', 639329),
 ('xxup', 623156),
 ('this', 589390),
 ('you', 585775),
 ('they', 584062),
 ('1', 583834),
 ('but', 557085)]

In [35]:
max_vocab = 60000
min_freq = 2

itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [36]:
stoi = collections.defaultdict(lambda: 0, { v: k for k, v in enumerate(itos) })
len(itos)

60002

In [37]:
trn_lm = np.array([ [stoi[o] for o in p] for p in train_tokens ])
val_lm = np.array([ [stoi[o] for o in p] for p in valid_tokens ])


In [38]:
df_train['tokens']=trn_lm
df_valid['tokens']=val_lm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [39]:
pickle.dump([df_train,df_valid,itos, train_tokens, valid_tokens, trn_lm, val_lm],open(f'{DATAPATH}/inter/dfs_tokens_fastai.pkl','wb'))

In [88]:
[df_train,df_valid,itos, train_tokens, valid_tokens, trn_lm, val_lm]=pickle.load(open(f'{DATAPATH}/inter/dfs_tokens_fastai.pkl','rb'))