## Read files into a pandas DataFrame

In [1]:
PATH="/home/kirana/Documents/phd"
DATAPATH="/home/kirana/Documents/phd/data/experiment/SST_2"

In [2]:
import fastai
from fastai.text import *
from fastai import *

In [3]:
ls {DATAPATH}

[0m[01;34meval[0m/  README.md  [01;34mtraining[0m/


In [4]:
ls {DATAPATH}/training

SST_2__FULL.csv


In [5]:
ls {DATAPATH}/eval

SST_2__DEV.csv  SST_2__TEST.csv


In [7]:
df_train=pd.read_csv(f'{DATAPATH}/training/SST_2__FULL.csv',header=None,names=['text','label'])

In [8]:
df_train.head()

Unnamed: 0,text,label
0,! Brilliant,positive
1,! Brilliant !,positive
2,! Brilliant ! ',positive
3,! Gollum's ` performance ' is incredible,positive
4,"! Oh , look at that clever angle ! Wow , a jum...",negative


In [9]:
df_valid=pd.read_csv(f'{DATAPATH}/eval/SST_2__DEV.csv',header=None,names=['text','label'])
df_test=pd.read_csv(f'{DATAPATH}/eval/SST_2__TEST.csv',header=None,names=['text','label'])

In [10]:
df_train['dstype']='train'
df_valid['dstype']='valid'
df_test['dstype']='test'

In [11]:
df=pd.concat([df_train,df_valid,df_test],0)

In [12]:
df.shape

(119913, 3)

In [13]:
df['label'].value_counts()

positive    65468
negative    54445
Name: label, dtype: int64

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
myle=LabelEncoder()

In [16]:
df['label']=myle.fit_transform(df['label'])

In [17]:
df['label'].value_counts()

1    65468
0    54445
Name: label, dtype: int64

In [18]:
df['label']=1-df['label']

In [19]:
df['label'].value_counts()

0    65468
1    54445
Name: label, dtype: int64

## Cross-Validation

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
df_train,df_valid=train_test_split(df,train_size=0.9,test_size=0.1,random_state=11,shuffle=True)

In [22]:
df.shape,df_train.shape,df_valid.shape

((119913, 3), (107921, 3), (11992, 3))

In [23]:
df.head()

Unnamed: 0,text,label,dstype
0,! Brilliant,0,train
1,! Brilliant !,0,train
2,! Brilliant ! ',0,train
3,! Gollum's ` performance ' is incredible,0,train
4,"! Oh , look at that clever angle ! Wow , a jum...",1,train


In [24]:
df_train_file=df_train.loc[:,['label','text']]
df_valid_file=df_valid.loc[:,['label','text']]

In [26]:
df_train_file.to_csv(f'{DATAPATH}/inter/df_train.csv',index=False)
df_valid_file.to_csv(f'{DATAPATH}/inter/df_valid.csv',index=False)



In [28]:
df_train.shape

(107921, 3)

## Tokenize and Numericalize

In [29]:
re1 = re.compile(r'  +')

def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

def get_texts(df, n_lbls=1):
    labels = df.iloc[:, range(n_lbls)].values.astype(np.int64)

    texts = f'\n{BOS} {FLD} 1 ' + df.iloc[:,n_lbls].astype(str)
    for i in range(n_lbls + 1, len(df.columns)):
        texts += f' {FLD} {i - n_lbls} ' + df[i].astype(str)
    texts = texts.apply(fixup).values.astype(str)

    tokenizer = Tokenizer(n_cpus=10)
    tokop=tokenizer.process_all(texts)
    return tokop, list(labels)


def get_all(df, n_lbls):
    tok, labels = [], []
    #import pdb
    #pdb.set_trace()
    for i, txt in enumerate(df):
        tok_, labels_ = get_texts(txt, n_lbls)
        tok += tok_
        labels += labels_
    return tok, labels

In [30]:
chunksize=24000
chunk_train=pd.read_csv(f'{DATAPATH}/inter/df_train.csv',chunksize=chunksize)
chunk_valid=pd.read_csv(f'{DATAPATH}/inter/df_valid.csv',chunksize=chunksize)



In [31]:
train_tokens, train_labels = get_all(chunk_train, 1)
valid_tokens, valid_labels = get_all(chunk_valid, 1)


In [32]:
df_train['words']=train_tokens
df_valid['words']=valid_tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [33]:
freq = Counter(p for o in train_tokens for p in o)
freq.most_common(25)

[('1', 108001),
 (' \n ', 107921),
 ('xxbos', 107921),
 ('xxfld', 107921),
 ('xxmaj', 63398),
 ('the', 46538),
 (',', 42009),
 ('a', 36178),
 ('and', 32481),
 ('of', 29918),
 ('.', 22665),
 ('to', 21878),
 ('-', 19738),
 ('is', 15162),
 ("'s", 14796),
 ('it', 13060),
 ('that', 12908),
 ('in', 12788),
 ('as', 8529),
 ('with', 7785),
 ('for', 6998),
 ('its', 6761),
 ('film', 6600),
 ('an', 6579),
 ('movie', 6360)]

In [34]:
max_vocab = 60000
min_freq = 2

itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [35]:
stoi = collections.defaultdict(lambda: 0, { v: k for k, v in enumerate(itos) })
len(itos)

16206

In [36]:
trn_lm = np.array([ [stoi[o] for o in p] for p in train_tokens ])
val_lm = np.array([ [stoi[o] for o in p] for p in valid_tokens ])


In [37]:
df_train['tokens']=trn_lm
df_valid['tokens']=val_lm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [38]:
pickle.dump([df_train,df_valid,itos, train_tokens, valid_tokens, trn_lm, val_lm],open(f'{DATAPATH}/inter/dfs_tokens_fastai.pkl','wb'))

In [88]:
[df_train,df_valid,itos, train_tokens, valid_tokens, trn_lm, val_lm]=pickle.load(open(f'{DATAPATH}/inter/dfs_tokens_fastai.pkl','rb'))