In [10]:
import json

import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [4]:
# Labelled question/answer pairs used for training.
train_df = pd.read_csv(filepath_or_buffer='data/external/train.csv')
train_df

Unnamed: 0,questions,answer
0,Between the TechCrunch report on Sam Bankman-F...,yes
1,Between the report from The Verge on Apple's d...,no
2,Between the Polygon article published on Septe...,yes
3,Did the reporting on player actions in sports ...,no
4,Does the Sporting News article claim that Caes...,yes
...,...,...
1976,Did Engadget fail to report a discount on the ...,no
1977,"Between the TechCrunch article on December 7, ...",yes
1978,Did the FOX News - Entertainment article attri...,yes
1979,"Which company, covered by The Verge for exclus...",valve


In [5]:
# Supplementary article table (title, author, url, source, category, fact).
extra_df = pd.read_csv(filepath_or_buffer='data/external/extra_df.csv')
extra_df

Unnamed: 0,title,author,url,source,category,fact
0,The FTX trial is bigger than Sam Bankman-Fried,Elizabeth Lopatto,https://www.theverge.com/2023/9/28/23893269/ft...,The Verge,technology,"Before his fall, Bankman-Fried made himself ou..."
1,"SBF’s trial starts soon, but how did he — and ...",Jacquelyn Melinek,https://techcrunch.com/2023/10/01/ftx-lawsuit-...,TechCrunch,technology,The highly anticipated criminal trial for Sam ...
2,"Sam Altman backs teens’ startup, Google unveil...",Kyle Wiggers,https://techcrunch.com/2023/10/07/sam-altman-b...,TechCrunch,technology,The prosecution painted Bankman-Fried as someo...
3,Donald Trump defrauded banks with 'fantasy' to...,"Michael R. Sisak, The Associated Press",https://fortune.com/2023/09/26/donald-trump-fr...,Fortune,business,No apartment in New York City has ever sold fo...
4,The $777 million surprise: Donald Trump is get...,Tom Maloney,https://www.theage.com.au/business/companies/t...,The Age,business,The prosecution argues that was to mask a drop...
...,...,...,...,...,...,...
6079,SBF Trial: The latest updates from the FTX col...,Morgan Little,https://techcrunch.com/2023/10/06/sbf-trial-th...,TechCrunch,technology,The second week of the trial’s standout testim...
6080,"Sam Altman backs teens’ startup, Google unveil...",Kyle Wiggers,https://techcrunch.com/2023/10/07/sam-altman-b...,TechCrunch,technology,The prosecution painted Bankman-Fried as someo...
6081,Israel's blockade of Gaza means that a region ...,"Topher L. McDougal, The Conversation",https://fortune.com/2023/10/13/israel-blockade...,Fortune,business,"Israel stopped allowing deliveries of food, fu..."
6082,Israel's blockade of Gaza means that a region ...,"Topher L. McDougal, The Conversation",https://fortune.com/2023/10/13/israel-blockade...,Fortune,business,"While Israel has granted permits to about 17,0..."


In [46]:
# Persist the distinct answer strings (in order of first appearance) to a.json.
# `json` is imported in the notebook's top import cell rather than mid-notebook,
# and `.tolist()` converts the NumPy array returned by `.unique()` into a plain
# Python list before serialisation.
with open('a.json', 'w') as answers_file:
    json.dump(train_df['answer'].unique().tolist(), answers_file)

In [49]:
# How often each distinct answer string occurs, most frequent first.
answer_counts = train_df['answer'].value_counts()
answer_counts

answer
yes                         578
no                          429
insufficient information    229
sam bankman-fried           215
google                      159
                           ... 
change                        1
consistency                   1
constantinople                1
alan turing                   1
cybertruck                    1
Name: count, Length: 152, dtype: int64

In [43]:
# How many rows of the supplementary table fall into each category.
category_counts = extra_df['category'].value_counts()
category_counts

category
technology       3293
sports           1263
business          950
entertainment     512
science            43
health             23
Name: count, dtype: int64

In [42]:
# Summary statistics of `len(question)` over the training questions.
question_lengths = train_df['questions'].map(len)
question_lengths.describe()

count    1981.000000
mean      290.212519
std        75.122289
min        21.000000
25%       251.000000
50%       289.000000
75%       337.000000
max       652.000000
Name: questions, dtype: float64

In [19]:
# Encode the target: the three closed-form answers become classes 0, 1 and 2
# (in the order listed); every other answer collapses into a single class 3.
closed_answers = ['yes', 'no', 'insufficient information']
answer_to_class = {label: position for position, label in enumerate(closed_answers)}
other_class = len(closed_answers)

train_df['answer_'] = train_df['answer'].apply(
    lambda answer: answer_to_class.get(answer, other_class)
)
train_df

Unnamed: 0,questions,answer,answer_
0,Between the TechCrunch report on Sam Bankman-F...,yes,0
1,Between the report from The Verge on Apple's d...,no,1
2,Between the Polygon article published on Septe...,yes,0
3,Did the reporting on player actions in sports ...,no,1
4,Does the Sporting News article claim that Caes...,yes,0
...,...,...,...
1976,Did Engadget fail to report a discount on the ...,no,1
1977,"Between the TechCrunch article on December 7, ...",yes,0
1978,Did the FOX News - Entertainment article attri...,yes,0
1979,"Which company, covered by The Verge for exclus...",valve,3


In [39]:
# CatBoost hyper-parameters for the multi-class answer model.
# `colsample_bylevel` (0.8) is currently disabled and therefore not passed.
params = dict(
    iterations=10000,
    depth=8,
    learning_rate=0.01,
    l2_leaf_reg=4,
    subsample=0.9,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    bootstrap_type='Bernoulli',
    task_type='GPU',
    boosting_type='Plain',
)

# Unfitted classifier configured with the settings above.
model = cb.CatBoostClassifier(**params)

In [40]:
def _as_text_pool(frame, target):
    """Wrap a question frame and its labels in a CatBoost Pool, treating 'questions' as text."""
    return cb.Pool(data=frame, label=target, text_features=['questions'])


# Hold out 20% of the labelled rows (shuffled, fixed seed) for evaluation.
question_frame = train_df[['questions']]
encoded_answers = train_df['answer_']
train_X, test_X, train_y, test_y = train_test_split(
    question_frame,
    encoded_answers,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

train_dataset = _as_text_pool(train_X, train_y)
eval_dataset = _as_text_pool(test_X, test_y)

# Fit with the held-out pool as eval set; stop after 200 rounds without
# improvement and log every 100 iterations.
fit_options = dict(
    eval_set=eval_dataset,
    early_stopping_rounds=200,
    verbose=100,
)
model.fit(train_dataset, **fit_options)

0:	learn: 0.8263889	test: 0.7984887	best: 0.7984887 (0)	total: 8.33ms	remaining: 1m 23s
100:	learn: 0.8377525	test: 0.8110831	best: 0.8161209 (65)	total: 725ms	remaining: 1m 11s
200:	learn: 0.8459596	test: 0.8110831	best: 0.8161209 (65)	total: 1.44s	remaining: 1m 10s
bestTest = 0.8161209068
bestIteration = 65
Shrink model to first 66 iterations.


<catboost.core.CatBoostClassifier at 0x75f55424de90>

In [47]:
# Unlabelled questions to predict (columns: ID, questions).
test_df = pd.read_csv(filepath_or_buffer='data/external/test.csv')
test_df

Unnamed: 0,ID,questions
0,0,"After the TechCrunch report on November 18, 20..."
1,1,Considering the information from an article by...
2,2,Considering the information from an article in...
3,3,Was Owen Teale's career impact discussed in Th...
4,4,What company developed the world’s first succe...
...,...,...
843,843,Who was the CEO of Alameda and former girlfrie...
844,844,Does the article from The Verge suggest that G...
845,845,Does the Polygon article suggest that 'The Pos...
846,846,Between the report by FOX News - Health on 'Pe...


In [48]:
# Sample submission file showing the expected output columns (ID, answer).
samsub_df = pd.read_csv(filepath_or_buffer='data/external/sample_submission.csv')
samsub_df

Unnamed: 0,ID,answer
0,0,yes
1,1,yes
2,2,yes
3,3,yes
4,4,yes
...,...,...
843,843,yes
844,844,yes
845,845,yes
846,846,yes
