In [6]:
import pandas as pd 
import numpy as np 


In [7]:
training_data = pd.read_csv('data/liar-plus/train2.tsv', sep='\t', header=None)
training_data.columns =['index','id', 'label', 'statement', 'subject', 'speaker', 'speaker_job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context','justification']
training_data.drop(columns=['index'], inplace=True)
training_data.head()

Unnamed: 0,id,label,statement,subject,speaker,speaker_job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,justification
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."


In [8]:
df = training_data[['statement', 'party_affiliation', 'label']]
df

Unnamed: 0,statement,party_affiliation,label
0,Says the Annies List political group supports ...,republican,false
1,When did the decline of coal start? It started...,democrat,half-true
2,"Hillary Clinton agrees with John McCain ""by vo...",democrat,mostly-true
3,Health care reform legislation is likely to ma...,none,false
4,The economic turnaround started at the end of ...,democrat,half-true
...,...,...,...
10237,There are a larger number of shark attacks in ...,none,mostly-true
10238,Democrats have now become the party of the [At...,republican,mostly-true
10239,Says an alternative to Social Security that op...,republican,half-true
10240,On lifting the U.S. Cuban embargo and allowing...,democrat,false


In [9]:
df['word_count'] = df['statement'].apply(lambda x: len(x.split()) if pd.notnull(x) else 0)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['word_count'] = df['statement'].apply(lambda x: len(x.split()) if pd.notnull(x) else 0)


Unnamed: 0,statement,party_affiliation,label,word_count
0,Says the Annies List political group supports ...,republican,false,11
1,When did the decline of coal start? It started...,democrat,half-true,24
2,"Hillary Clinton agrees with John McCain ""by vo...",democrat,mostly-true,19
3,Health care reform legislation is likely to ma...,none,false,12
4,The economic turnaround started at the end of ...,democrat,half-true,10
...,...,...,...,...
10237,There are a larger number of shark attacks in ...,none,mostly-true,17
10238,Democrats have now become the party of the [At...,republican,mostly-true,14
10239,Says an alternative to Social Security that op...,republican,half-true,28
10240,On lifting the U.S. Cuban embargo and allowing...,democrat,false,11


In [10]:
df['word_count'].describe()

count    10242.000000
mean        18.027338
std          9.928149
min          0.000000
25%         12.000000
50%         17.000000
75%         22.000000
max        479.000000
Name: word_count, dtype: float64

In [11]:
datum = df.iloc[0]

In [12]:
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp(datum['statement'])
doc.vector.shape

(300,)

In [5]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = SentenceTransformer("all-MiniLM-L6-v2")

def chunk_text(text, chunk_size=200, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)
    return chunks

def embed_statement(text):
    if pd.isnull(text) or not text.strip():
        return np.zeros(model.get_sentence_embedding_dimension())
    chunks = chunk_text(text)
    embeddings = model.encode(chunks)
    return np.mean(embeddings, axis=0)

| Label                            | Category                                                                       |
| -------------------------------- | ------------------------------------------------------------------------------ |
| **republican**                   | right-leaning                                                                  |
| **democrat**                     | left-leaning                                                                   |
| **none**                         | centrist / other (depending on usage; “none” often means no clear affiliation) |
| **organization**                 | other                                                                          |
| **independent**                  | centrist                                                                       |
| **newsmaker**                    | other                                                                          |
| **libertarian**                  | right-leaning                                                                  |
| **activist**                     | other *(activism can be left or right, context-dependent)*                     |
| **journalist**                   | other *(nominally neutral / centrist in principle)*                            |
| **columnist**                    | other *(depends on outlet or individual lean)*                                 |
| **talk-show-host**               | other *(span the spectrum; e.g., could be left or right)*                      |
| **state-official**               | other *(administrative role, not inherently partisan)*                         |
| **labor-leader**                 | left-leaning                                                                   |
| **tea-party-member**             | right-leaning                                                                  |
| **business-leader**              | right-leaning                                                                  |
| **green**                        | left-leaning                                                                   |
| **education-official**           | centrist *(typically bureaucratic / nonpartisan)*                              |
| **liberal-party-canada**         | left-leaning                                                                   |
| **government-body**              | other                                                                          |
| **Moderate**                     | centrist                                                                       |
| **democratic-farmer-labor**      | left-leaning                                                                   |
| **ocean-state-tea-party-action** | right-leaning                                                                  |
| **constitution-party**           | right-leaning                                                                  |


In [16]:
df["embedding"] = df["statement"].apply(embed_statement)
X = np.stack(df["embedding"])
y = df["party_affiliation"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["embedding"] = df["statement"].apply(embed_statement)


In [20]:
affiliation_dict = {
    "democrat": "left",
    "labor-leader": "left",
    "green": "left",
    "liberal-party-canada": "left",
    "democratic-farmer-labor": "left",

    "republican": "right",
    "libertarian": "right",
    "tea-party-member": "right",
    "business-leader": "right",
    "ocean-state-tea-party-action": "right",
    "constitution-party": "right",

    "independent": "center",
    "none": "center",
    "Moderate": "center",
    "education-official": "center",

    np.nan: "other",
    "organization": "other",
    "newsmaker": "other",
    "activist": "other",
    "journalist": "other",
    "columnist": "other",
    "talk-show-host": "other",
    "state-official": "other",
    "government-body": "other"
}

y = y.apply(lambda x: affiliation_dict[x])

In [25]:
def preprocess_data(dataframe):
    dataframe = dataframe.dropna(subset=['statement', 'party_affiliation'])
    X = np.stack(dataframe["statement"].apply(embed_statement))
    y = dataframe["party_affiliation"].apply(lambda x: affiliation_dict[x] if x in affiliation_dict else "other")
    return X, y

In [23]:
testing_data = pd.read_csv('data/liar-plus/test2.tsv', sep='\t', header=None)
testing_data.columns =['index','id', 'label', 'statement', 'subject', 'speaker', 'speaker_job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context','justification']
testing_data.drop(columns=['index'], inplace=True)
testing_data.head()

Unnamed: 0,id,label,statement,subject,speaker,speaker_job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,justification
0,11972.json,true,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview,"Meantime, engineering experts agree the wall w..."
1,11685.json,false,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference,She cited layoff notices received by the state...
2,11096.json,false,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.,"Trump said that McCain ""has done nothing to he..."
3,5209.json,half-true,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show,"But spending still goes up. In addition, many ..."
4,9524.json,pants-fire,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video,Our rating A Democratic Party web video making...


In [26]:
X_test, y_test = preprocess_data(testing_data)

In [32]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [10, 20, 50, 70, 100],
    'n_estimators': [10, 50, 100, 200],
}

gs = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=params,
    scoring='accuracy',
    cv=5)

gs.fit(X, y)

print("Best parameters found: ", gs.best_params_)
clf = gs.best_estimator_

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

Best parameters found:  {'max_depth': 50, 'n_estimators': 200}
              precision    recall  f1-score   support

      center       0.67      0.01      0.02       228
        left       0.40      0.23      0.29       406
       other       0.00      0.00      0.00        55
       right       0.49      0.88      0.63       578

    accuracy                           0.48      1267
   macro avg       0.39      0.28      0.24      1267
weighted avg       0.47      0.48      0.39      1267



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
