# Dataset Statistics

In [None]:
from pandas import read_csv
import matplotlib.pyplot as plt
import pandas as pd 

from arclus.settings import PREP_ASSIGNMENTS, PREP_PREMISES, PREP_CLAIMS

----------------------------
## Fetch data 
----------------------------

In [None]:
# Load the three preprocessed CSV tables in one pass; the unpacking order
# matches the order of the paths (premises, claims, assignments).
premises_df, claims_df, ass_df = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (PREP_PREMISES, PREP_CLAIMS, PREP_ASSIGNMENTS)
)

#### Claims
* add col for number of chars in claim_text
* add col for number of words in claim_text

In [None]:
# Derive two length features per claim from `claim_text`:
# the number of whitespace-separated tokens and the number of characters.
claim_texts = claims_df["claim_text"]
claims_df["claims_n_words"] = claim_texts.str.split().str.len()
claims_df["claims_n_chars"] = claim_texts.str.len()
claims_df

#### Premises
* add col for number of chars in premise_text
* add col for number of words in premise_text

In [None]:
# Derive two length features per premise from `premise_text`:
# the number of whitespace-separated tokens and the number of characters.
premise_texts = premises_df["premise_text"]
premises_df["premises_n_words"] = premise_texts.str.split().str.len()
premises_df["premises_n_chars"] = premise_texts.str.len()
premises_df

#### Assignments

In [None]:
# Display the assignment table exactly as loaded from PREP_ASSIGNMENTS
# (it is joined with premises and claims on premise_id / claim_id below).
ass_df

#### Extended Assignments

In [None]:
# Enrich every assignment row with its premise and its claim (inner joins, so
# assignments without a matching premise or claim are dropped), then remove the
# three "Unnamed: 0*" columns that the merges carry over from the input frames
# (presumably index columns written out with the CSVs - confirm against the files).
ass_extended = (
    ass_df
    .merge(premises_df, how="inner", on="premise_id")
    .merge(claims_df, how="inner", on="claim_id")
    .drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "Unnamed: 0"])
)
ass_extended

----------------------------
## Statistics
-----------------------------

#### Assignments grouped by **claim_text**

In [None]:
# Per-claim-text summary: min/max/mean character lengths of the premises and
# claims in each group, plus the number of assigned premises. Rows are sorted
# so that the claim text with the most premises comes first.
claim_agg_spec = {
    "premises_n_chars": ["min", "max", "mean"],
    "claims_n_chars": ["min", "max", "mean"],
    "premise_id": "count",
}
gb_claim_text = (
    ass_extended
    .groupby(["claim_text"])
    .agg(claim_agg_spec)
    .sort_values(("premise_id", "count"), ascending=False)
)
gb_claim_text

#### Assignments grouped by **premise_text** 

In [None]:
# Per-premise-text summary: min/max/mean character lengths of the premises and
# claims in each group, plus the number of claims the premise is assigned to.
# Rows are sorted so that the premise text with the most claims comes first.
premise_agg_spec = {
    "premises_n_chars": ["min", "max", "mean"],
    "claims_n_chars": ["min", "max", "mean"],
    "claim_id": "count",
}
gb_premise_text = (
    ass_extended
    .groupby(["premise_text"])
    .agg(premise_agg_spec)
    .sort_values(("claim_id", "count"), ascending=False)
)
gb_premise_text

#### Dataset Statistics

In [None]:
# Build the summary table of dataset statistics.
#
# The table is created from the full list of rows in one constructor call.
# `DataFrame.append` (used previously, once per row) was removed in pandas 2.0
# and copied the whole frame on every call.
#
# Note: the text length / word count statistics are computed over
# `ass_extended`, i.e. over assignment rows, so a claim or premise that takes
# part in several assignments contributes once per assignment.

# Number of premises per claim text / claims per premise text, taken from the
# grouped frames so that only the needed column is reduced.
premises_per_claim = gb_claim_text[("premise_id", "count")]
claims_per_premise = gb_premise_text[("claim_id", "count")]

# Each entry is a [metric label, value] pair; the dashed entries are visual
# section separators in the rendered table.
metrics = [
    ["---------- claims -----------", "---------------------"],
    ["# claims total", len(claims_df)],
    ["# claims in assignments", len(gb_claim_text)],
    ["AVG length of claims", ass_extended["claims_n_chars"].mean()],
    ["MAX length of claims", ass_extended["claims_n_chars"].max()],
    ["MIN length of claims", ass_extended["claims_n_chars"].min()],
    ["AVG # words per claims", ass_extended["claims_n_words"].mean()],
    ["MAX # words per claims", ass_extended["claims_n_words"].max()],
    ["MIN # words per claims", ass_extended["claims_n_words"].min()],
    ["---------- premises -----------", "---------------------"],
    ["# premises total", len(premises_df)],
    ["# premises in assignments", len(gb_premise_text)],
    ["AVG length of premises", ass_extended["premises_n_chars"].mean()],
    ["Max length of premises", ass_extended["premises_n_chars"].max()],
    ["Min length of premises", ass_extended["premises_n_chars"].min()],
    ["AVG # words per premise", ass_extended["premises_n_words"].mean()],
    ["MAX # words per premise", ass_extended["premises_n_words"].max()],
    ["MIN # words per premise", ass_extended["premises_n_words"].min()],
    ["---------- assignments -----------", "---------------------"],
    ["# assignments total", len(ass_df)],
    ["AVG # premises per claim", premises_per_claim.mean()],
    ["MAX # premises per claim", premises_per_claim.max()],
    ["MIN # premises per claim", premises_per_claim.min()],
    ["AVG # claims per premise", claims_per_premise.mean()],
    ["MAX # claims per premise", claims_per_premise.max()],
    ["MIN # claims per premise", claims_per_premise.min()],
]

statistics_df = pd.DataFrame(metrics, columns=["metric", "value"])
statistics_df


----------------------------
## Visualizations
---------------------------

#### Distribution of premises per claim

In [None]:
# Horizontal boxplot of the number of premises per claim text, drawn twice:
# once over the full range and once zoomed in on the bulk of the distribution.
# Each tuple is (upper x-limit, spacing of the major x ticks).
premises_per_claim = gb_claim_text[("premise_id", "count")]

for x_max, tick_step in ((3650, 250), (300, 20)):
    fig1, ax1 = plt.subplots(figsize=(40, 4))
    ax1.set_title('Distribution of premises per claims')
    ax1.set_xlabel('# premises per claim')
    # The first argument of grid() is the on/off flag; the previous
    # plt.grid("both") only enabled the grid because "both" is truthy.
    ax1.grid(True)
    ax1.set_xlim([-1, x_max])
    ax1.xaxis.set_major_locator(plt.MultipleLocator(tick_step))
    ax1.boxplot(premises_per_claim, vert=False)
    plt.show()

* Most claims have between 5 and 20 premises
* Claim with most premises has 3608 premises
* Text of this claim: *'Should homework be banned?'*
* Some premises are just gibberish
* Some premises contain only emojis

#### Distribution of claims per premise

In [None]:
# Horizontal boxplot of the number of claims each premise text is assigned to.
fig1, ax1 = plt.subplots(figsize=(40, 4))
ax1.set_title('Distribution of claims per premises')
ax1.set_xlabel('# claims per premise')
# The first argument of grid() is the on/off flag; the previous
# plt.grid("both") only enabled the grid because "both" is truthy.
ax1.grid(True)
ax1.boxplot(gb_premise_text[("claim_id", "count")], vert=False)
plt.show()

* Premise assigned to most claims is assigned to 81 claims
* Text of this premise: *'No no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no'*

#### Distribution of Claim Length

In [None]:
# Two horizontal boxplots on a shared axis: claim length in characters and in
# words, computed over all claims (not only those that appear in assignments).
fig1, ax1 = plt.subplots(figsize=(40, 4))
ax1.set_title('Claim length')
ax1.set_xlabel('Count')
# The first argument of grid() is the on/off flag; the previous
# plt.grid("both") only enabled the grid because "both" is truthy.
ax1.grid(True)
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
# `labels` is kept here so the cell still runs on older versions - confirm
# against the pinned Matplotlib version.
ax1.boxplot(
    [claims_df["claims_n_chars"], claims_df["claims_n_words"]],
    vert=False,
    labels=['# of chars (c)', '# of words (c)'],
)
plt.show()

* There are 38 claims that just have one word as text: 

In [None]:
# Claims whose text consists of exactly one whitespace-separated token.
claims_df.loc[claims_df["claims_n_words"].eq(1)]

* The claim with the maximum number of words (49) is *'In May 2008, Barack Obama said he had been to fifty-seven states in the U.S. How many states are there in the US? Yes = there are 57 because the President has said so. No = there are only 50 states in the union, and the President was wrong.'*

In [None]:
# Claim(s) with the largest word count. The maximum is computed from the data
# instead of being hard-coded, so the cell stays correct if the dataset changes.
claims_df[claims_df["claims_n_words"] == claims_df["claims_n_words"].max()]

#### Distribution of Premise Length

In [None]:
# Two horizontal boxplots on a shared axis: premise length in characters and in
# words, computed over all premises (not only those that appear in assignments).
fig1, ax1 = plt.subplots(figsize=(40, 4))
ax1.set_title('Premise length')
ax1.set_xlabel('Count')
# The first argument of grid() is the on/off flag; the previous
# plt.grid("both") only enabled the grid because "both" is truthy.
ax1.grid(True)
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
# `labels` is kept here so the cell still runs on older versions - confirm
# against the pinned Matplotlib version.
ax1.boxplot(
    [premises_df["premises_n_chars"], premises_df["premises_n_words"]],
    vert=False,
    labels=['# of chars (p)', '# of words (p)'],
)
plt.show()

* There are 308 premises that just have one word as text:

In [None]:
# Premises whose text consists of exactly one whitespace-separated token.
premises_df.loc[premises_df["premises_n_words"].eq(1)]

* The premise with the maximum number of words (6180) is *'I want to warn you in advance folks that ... like you and me, abdicated their responsibilities for most of their life. URL:'* (Too long to display)

In [None]:
# Premise(s) with the largest word count. The maximum is computed from the data
# instead of being hard-coded, so the cell stays correct if the dataset changes.
premises_df[premises_df["premises_n_words"] == premises_df["premises_n_words"].max()]

-----------------
## Investigations
-----------------

### Claim with most premises
* Group assignments by **claim_text** and **claim_id**, then sort by the number of premises (descending)

In [None]:
# Rank claims by the number of assigned premises.
#
# `premise_id` is selected BEFORE aggregating. Aggregating every column and
# picking `premise_id` afterwards also tries to take the mean of the text
# columns, which raises a TypeError on pandas >= 2.0.
#
# Result: one row per (claim_text, claim_id) with the columns `mean` and
# `count` of `premise_id`, sorted so the claim with the most premises is first.
# Later cells read the claim id from `ass_gb_c_text.index[0][1]`.
ass_gb_c_text = (
    ass_extended
    .groupby(["claim_text", "claim_id"])["premise_id"]
    .agg(["mean", "count"])
    .sort_values("count", ascending=False)
)
ass_gb_c_text

* Display claim with most premises

In [None]:
# The first index entry of the sorted frame is a (claim_text, claim_id) pair;
# element 1 is the id of the claim with the most premises.
top_claim_id = ass_gb_c_text.index[0][1]
claims_df.loc[claims_df["claim_id"] == top_claim_id]

* Find the premises assigned to the claim that has the most premises (looked up via its claim_id)

In [None]:
# Collect the premise ids assigned to the claim with the most premises (its id
# is element 1 of the first index entry), then pull the matching premise rows.
most_premises_claim_id = ass_gb_c_text.index[0][1]
ids_p = ass_df.loc[ass_df["claim_id"] == most_premises_claim_id, "premise_id"]
rel_premises = premises_df.loc[premises_df["premise_id"].isin(ids_p)]
rel_premises

* Display all premises for that claim

In [None]:
from IPython.display import HTML, display

# Render the premise texts as a full HTML table so that every row is shown
# instead of pandas' truncated default representation.
premise_text_html = rel_premises[["premise_text"]].to_html()
display(HTML(premise_text_html))