In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# 3.8 dropped the old 'seaborn' alias, so use whichever name this install provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
from matplotlib import rcParams
# Global text sizing applied to every figure created after this cell.
rcParams['font.size'] = 14
rcParams['legend.fontsize'] = 'small'
rcParams['figure.titlesize'] = 'large'

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import confusion_matrix
import pickle
from pipeline import *

In [3]:
# Read the training split into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/train.csv')
df.head(n=5)

Unnamed: 0,Labels,Text,Text_Tag
0,1,Says the Annies List political group supports ...,abortion
1,2,When did the decline of coal start? It started...,"energy,history,job-accomplishments"
2,3,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy
3,1,Health care reform legislation is likely to ma...,health-care
4,2,The economic turnaround started at the end of ...,"economy,jobs"


In [4]:
# Label legend for the integer classes:
# 0: Barely-True
# 1: False
# 2: Half-True
# 3: Mostly-True
# 4: Not-Known
# 5: True
# Split the target off the feature frame. pop() removes the column from df in
# place, so guard it: re-running this cell would otherwise raise KeyError('Labels').
if 'Labels' in df.columns:
    y = df.pop('Labels')
np.unique(y)

array([0, 1, 2, 3, 4, 5])

In [6]:
# Pull the raw statements out as a plain Python list, then hand them to the
# project's cleaning pipeline (its printed trace shows lowercasing, punctuation,
# stopword and accent removal, tokenization and lemmatization).
X = df.loc[:, 'Text'].tolist()
docs_lem = cleaning_pipeline(X)

Lowercase:
['says the annies list political group supports third-trimester abortions on demand.']

Punctuation Removed:
['says the annies list political group supports thirdtrimester abortions on demand']

Stopwords Removed:
['says annies list political group supports thirdtrimester abortions demand']

Accents Removed:
['says annies list political group supports thirdtrimester abortions demand']

Tokenized:
[['says', 'annies', 'list', 'political', 'group', 'supports', 'thirdtrimester', 'abortions', 'demand']]

Lemmatized:
['say annies list political group support thirdtrimester abortion demand']


## https://github.com/DatoJanez/electra-ka

```python
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast
model = ElectraForSequenceClassification.from_pretrained("./electra-ka-fake-news-tagging")
tokenizer = ElectraTokenizerFast.from_pretrained("./electra-ka-fake-news-tagging/")

inputs = tokenizer("your text goes here...", return_tensors="pt")
predictions = model(**inputs)
```

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directory holding the pretrained checkpoint. Named once so the tokenizer and
# the model are always loaded from the same location.
MODEL_DIR = "../../jnz/electra-ka-fake-news-tagging"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

In [24]:
# Spot-check: the first cleaned document next to its label.
(docs_lem[0], y[0])

('say annies list political group support thirdtrimester abortion demand', 1)

In [47]:
import torch  # cell-local import: only needed to switch off autograd for inference

# Score the first few cleaned statements with the loaded sequence-classification
# model and, for each one, print every class score from lowest to highest.
to_predict = docs_lem[:10]

for i, row in enumerate(to_predict):
    # Header line: the cleaned text followed by its ground-truth label.
    print(f'{row}: {y[i]}')
    inputs = tokenizer(row, return_tensors='pt')
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        predict = model(**inputs)
    # predict[0] is the first model output; the batch holds a single row, so
    # take element 0 to get one flat list of per-class scores.
    predictions = predict[0].tolist()[0]
    # Rank the classes once per statement and iterate over the model's own
    # outputs, rather than re-sorting inside the loop and relying on the number
    # of distinct labels in y matching the number of scores. The loop variable
    # no longer shadows the outer index i.
    for level in np.argsort(predictions):
        print(f'{level}: {predictions[level]}')
    print('\n')

say annies list political group support thirdtrimester abortion demand: 1
4: -5.9652419090271
5: -1.4457597732543945
3: -0.8072352409362793
1: -0.4224981367588043
2: -0.3576163053512573
0: 3.331812620162964


decline coal start started natural gas took started begin president george w bush administration: 2
4: -4.058540344238281
5: -1.1565181016921997
3: -0.3797895610332489
1: -0.16735374927520752
2: 0.1972362995147705
0: 1.749159574508667


hillary clinton agrees john mccain voting give george bush benefit doubt iran: 3
5: -2.460890054702759
4: -1.6862925291061401
0: -0.8812388777732849
3: 0.17314687371253967
2: 0.9701154232025146
1: 1.9799727201461792


health care reform legislation likely mandate free sex change surgery: 1
4: -5.374716281890869
5: -1.8953756093978882
3: -1.09137761592865
1: -0.2838018238544464
2: 0.33338311314582825
0: 3.0865705013275146


economic turnaround started end term: 2
4: -3.358527183532715
5: -1.8341586589813232
3: -0.7119676470756531
1: 0.28294309973716