# Learning to Detect Fake Content on Twitter

## Contents

* Introduction
* Data
* Feature Engineering
* Classification
* Results
* Tweet Verification Assistant

In [175]:
# import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import json
import seaborn as sns
pd.options.display.max_columns = 100

## Data
### Load and Explore the Data

In [176]:
# load the data
data  = pd.read_csv('data/tweets_info.csv')
data['id'] = data['id'].astype(str)
data.shape

(17857, 4)

In [177]:
data.head()

Unnamed: 0,id,text,event,label
0,324597532548276224,Don't need feds to solve the #bostonbombing wh...,Boston,fake
1,325145334739267584,PIC: Comparison of #Boston suspect Sunil Tripa...,Boston,fake
2,325152091423248385,I'm not completely convinced that it's this Su...,Boston,fake
3,324554646976868352,Brutal lo que se puede conseguir en colaboraci...,Boston,fake
4,324315545572896768,4chan and the bombing. just throwing it out th...,Boston,fake


In [178]:
# load the tweet data
with open('data/tweetsFeatsVMU2016.txt') as f:
    lines = f.readlines()

lines = [json.loads(line) for line in lines]
tweet_data  = pd.DataFrame(lines)
tweet_data.drop(columns=['_id', 'annotation'], inplace=True)
tweet_data['id'] = tweet_data['id'].astype(str)
print('Data has {} samples with {} attributes'.format(*tweet_data.shape))

Data has 17857 samples with 33 attributes


In [179]:
tweet_data.tail()

Unnamed: 0,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount,urlHarmonic,urlIndegree,wotSafe,wotTrust
17852,79.0,28.0,344.0,272.0,False,False,False,False,False,False,False,True,True,False,712226185707712512,129,0,1,0,0.0,5.0,0.0,0,1.0,2,14,13,41.852,0,22020096.0,118384.0,,63.0
17853,79.0,28.0,344.0,272.0,False,False,False,False,False,False,False,True,True,False,712226175721066496,119,0,1,0,0.0,5.0,0.0,0,1.0,2,6,12,61.325,63,22020096.0,118384.0,,63.0
17854,,,,,False,False,False,False,False,False,False,True,False,False,712222764514484224,135,0,0,0,0.0,9.0,0.0,0,2.0,1,3,19,61.89,0,,,,
17855,,,,,False,False,False,False,False,False,False,True,False,False,712210186438631424,115,0,0,0,0.0,5.0,0.0,0,0.0,1,9,13,37.455,0,,,,
17856,,,,,False,False,False,False,False,False,False,True,False,False,712210186350493696,115,0,0,0,0.0,5.0,0.0,0,0.0,1,9,13,37.455,0,,,,


In [180]:
# load the user data
with open('data/userFeatsVMU2016.txt') as f:
    lines = f.readlines()

lines = [json.loads(line) for line in lines]
user_data = pd.DataFrame(lines)
user_data.drop(columns=['_id', 'annotation'], inplace=True)
user_data['id'] = user_data['id'].astype(str)

print('User data has {} samples with {} attributes'.format(*user_data.shape))


User data has 17857 samples with 25 attributes


In [181]:
user_data.head()

Unnamed: 0,FolFrieRatio,accountAge,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,harmonic,hasBio,hasExistingLocation,hasHeaderImg,hasLocation,hasProfileImg,hasURL,id,indegree,isVerified,numFavorites,numFollowers,numFriends,numMediaContent,numTweets,timesListed,tweetRatio,username,wotTrustUser
0,5.833922,1234982831,,,,,,True,False,False,True,True,False,263046056240115712,,False,5243,1651,283,1069,43811,64,32.47286,iAnnieM,
1,43.68981,1284854676,,,,,18350080.0,True,True,False,True,True,True,262995061304852481,8.0,False,45,95637,2189,10400,54293,737,70.346275,CarlosVerareal,
2,0.862576,1271108498,,,,,,True,False,False,True,True,False,262979898002534400,,False,274,1701,1972,6455,34414,13,36.97036,LucasPalape,
3,0.88015,1295804773,,,,,,True,False,False,True,True,False,262996108400271360,,False,6,235,267,0,17837,0,27.651606,Haaaaarryyy,
4,0.937824,1297372694,,,,,,True,False,False,True,True,False,263018881839411200,,False,2867,181,193,1223,25754,0,41.076424,princess__natt,


### Check the missing values

In [182]:
features=tweet_data.columns.values
for feature in features:
    print(feature,'- Missing:', 
          sum(tweet_data[feature].isnull()),
          '- Unique:', len(tweet_data[feature].unique()))

alexaCountryRank - Missing: 15535 - Unique: 308
alexaDeltaRank - Missing: 15535 - Unique: 415
alexaPopularity - Missing: 15535 - Unique: 468
alexaReachRank - Missing: 15535 - Unique: 468
containsExclamationMark - Missing: 0 - Unique: 2
containsFirstOrderPron - Missing: 2710 - Unique: 3
containsHappyEmo - Missing: 0 - Unique: 2
containsQuestionMark - Missing: 0 - Unique: 2
containsSadEmo - Missing: 0 - Unique: 2
containsSecondOrderPron - Missing: 2710 - Unique: 3
containsThirdOrderPron - Missing: 2710 - Unique: 3
hasColon - Missing: 0 - Unique: 2
hasExternalLink - Missing: 0 - Unique: 2
hasPlease - Missing: 0 - Unique: 2
id - Missing: 0 - Unique: 17857
itemLength - Missing: 0 - Unique: 128
numExclamationMark - Missing: 0 - Unique: 15
numHashtags - Missing: 0 - Unique: 17
numMentions - Missing: 0 - Unique: 8
numNegSentiWords - Missing: 2710 - Unique: 8
numNouns - Missing: 2603 - Unique: 26
numPosSentiWords - Missing: 2710 - Unique: 8
numQuestionMark - Missing: 0 - Unique: 8
numSlangs - M

#### Number of positive/negative words

Quick check to see what is the nature of the tweets that present no positive sentiment words.

In [183]:
tweet_data.loc[tweet_data['numPosSentiWords'].isna()]

Unnamed: 0,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount,urlHarmonic,urlIndegree,wotSafe,wotTrust
7,,,,,False,,False,False,False,,,False,False,False,263111677485142017,69,0,5,0,,4.0,,0,,1,0,5,,0,,,,
8,,,,,False,,False,False,False,,,False,False,False,262977091983785985,36,0,2,0,,1.0,,0,,1,0,2,,0,,,,
10,,,,,False,,False,False,False,,,False,False,False,263129115207536640,45,0,3,0,,,,0,,1,0,3,,0,,,,
11,,,,,False,,False,False,False,,,False,False,False,263091320871063552,66,0,5,0,,1.0,,0,,1,0,5,,0,,,,
24,,,,,False,,False,False,False,,,False,False,False,263047501433688064,53,0,1,1,,,,0,,1,1,6,,1,,,,
25,,,,,False,,False,False,False,,,False,False,False,263033265336754176,48,0,4,0,,1.0,,0,,1,0,4,,0,,,,
28,,,,,False,,False,False,False,,,False,False,False,263422787513901056,66,0,4,0,,1.0,,0,,1,1,7,,0,,,,
29,,,,,False,,False,False,False,,,False,False,False,263060279586336768,31,0,1,0,,1.0,,0,,1,0,1,,0,,,,
34,,,,,False,,False,False,False,,,False,False,False,262976505917882368,76,0,6,0,,2.0,,0,,1,0,6,,0,,,,
37,,,,,False,,False,False,False,,,False,False,False,263018516083523584,51,0,4,0,,2.0,,0,,1,0,4,,0,,,,


Most of them are tweets with lots of hashtags, which were not taken into account while looking for sentiment words. Let's fill the nan values with 0.

In [184]:
tweet_data['numPosSentiWords'].fillna(0, inplace=True)
tweet_data['numNegSentiWords'].fillna(0, inplace=True)

### Feature Engineering

#### Merge the features

In [185]:
# add prefix to tweet data
tweet_data = tweet_data.add_prefix('tweet_')
# add prefix to user data
user_data = user_data.add_prefix('user_')
# merge the tweet and user features
features_raw = pd.merge(tweet_data, user_data, left_on='tweet_id', right_on='user_id')
# merge with the info
features_raw = pd.merge(data,features_raw, left_on='id', right_on='tweet_id')

In [186]:
features_raw.head()

Unnamed: 0,id,text,event,label,tweet_alexaCountryRank,tweet_alexaDeltaRank,tweet_alexaPopularity,tweet_alexaReachRank,tweet_containsExclamationMark,tweet_containsFirstOrderPron,tweet_containsHappyEmo,tweet_containsQuestionMark,tweet_containsSadEmo,tweet_containsSecondOrderPron,tweet_containsThirdOrderPron,tweet_hasColon,tweet_hasExternalLink,tweet_hasPlease,tweet_id,tweet_itemLength,tweet_numExclamationMark,tweet_numHashtags,tweet_numMentions,tweet_numNegSentiWords,tweet_numNouns,tweet_numPosSentiWords,tweet_numQuestionMark,tweet_numSlangs,tweet_numURLs,tweet_numUppercaseChars,tweet_numWords,tweet_readability,tweet_retweetCount,tweet_urlHarmonic,tweet_urlIndegree,tweet_wotSafe,tweet_wotTrust,user_FolFrieRatio,user_accountAge,user_alexaCountryRank,user_alexaDeltaRank,user_alexaPopularity,user_alexaReachRank,user_harmonic,user_hasBio,user_hasExistingLocation,user_hasHeaderImg,user_hasLocation,user_hasProfileImg,user_hasURL,user_id,user_indegree,user_isVerified,user_numFavorites,user_numFollowers,user_numFriends,user_numMediaContent,user_numTweets,user_timesListed,user_tweetRatio,user_username,user_wotTrustUser
0,324597532548276224,Don't need feds to solve the #bostonbombing wh...,Boston,fake,15.0,6.0,44.0,49.0,True,True,False,False,False,False,False,False,True,False,324597532548276224,88,2,2,0,0.0,2.0,0.0,0,0.0,1,1,13,112.085,0,20971520.0,6380.0,,67.0,0.438754,1350475461,,,,,,True,True,False,True,True,True,324597532548276224,,False,168,634,1445,0,903,1,4.953727,SantaCruzShred,6.0
1,325145334739267584,PIC: Comparison of #Boston suspect Sunil Tripa...,Boston,fake,,,,,False,False,False,False,False,True,True,True,False,False,325145334739267584,135,0,1,0,1.0,7.0,0.0,0,1.0,1,17,18,30.333,0,,,,,0.479646,1235638869,,,,,,True,True,False,True,True,False,325145334739267584,,False,879,271,565,133,11972,11,7.913144,Oscar_Wang,
2,325152091423248385,I'm not completely convinced that it's this Su...,Boston,fake,,,,,False,False,False,False,False,False,False,False,False,False,325152091423248385,88,0,0,0,0.0,2.0,0.0,0,1.0,1,3,13,46.605,0,,,,,1.126736,1222220242,,,,,,True,True,False,True,True,True,325152091423248385,,False,22,649,576,762,8313,28,4.983059,jamwil,
3,324554646976868352,Brutal lo que se puede conseguir en colaboraci...,Boston,fake,15.0,6.0,44.0,49.0,False,False,False,False,False,False,False,False,True,False,324554646976868352,137,0,2,0,1.0,,0.0,0,1.0,1,1,19,,0,20971520.0,6380.0,,67.0,0.543956,1306061808,13.0,-3.0,15.0,16.0,22020096.0,True,False,False,True,True,True,324554646976868352,193085.0,False,69,297,546,545,2049,4,2.943054,rubenson80,65.0
4,324315545572896768,4chan and the bombing. just throwing it out th...,Boston,fake,,,,,False,False,False,False,False,False,True,True,False,False,324315545572896768,96,0,0,0,0.0,1.0,0.0,0,1.0,2,0,9,94.3,0,,,,,0.983607,1282240608,,,,,,True,False,False,False,True,False,324315545572896768,,False,1,60,61,66,816,0,0.840142,Slimlenny,


In [187]:
features_raw.shape

(17857, 62)

In [188]:
# keep the target and binarise
targets = features_raw['label'].map({'fake': 1, 'real': 0})
# keep the event they belong to
events = features_raw['event']
# keep their text
texts = features_raw['text']
# keep the ids
ids = features_raw['id']
# drop columns
features_raw.drop(columns=['tweet_id', 'user_id', 'user_username', 'label', 'event', 'text', 'id'], inplace=True)

In [189]:
features_raw.head()

Unnamed: 0,tweet_alexaCountryRank,tweet_alexaDeltaRank,tweet_alexaPopularity,tweet_alexaReachRank,tweet_containsExclamationMark,tweet_containsFirstOrderPron,tweet_containsHappyEmo,tweet_containsQuestionMark,tweet_containsSadEmo,tweet_containsSecondOrderPron,tweet_containsThirdOrderPron,tweet_hasColon,tweet_hasExternalLink,tweet_hasPlease,tweet_itemLength,tweet_numExclamationMark,tweet_numHashtags,tweet_numMentions,tweet_numNegSentiWords,tweet_numNouns,tweet_numPosSentiWords,tweet_numQuestionMark,tweet_numSlangs,tweet_numURLs,tweet_numUppercaseChars,tweet_numWords,tweet_readability,tweet_retweetCount,tweet_urlHarmonic,tweet_urlIndegree,tweet_wotSafe,tweet_wotTrust,user_FolFrieRatio,user_accountAge,user_alexaCountryRank,user_alexaDeltaRank,user_alexaPopularity,user_alexaReachRank,user_harmonic,user_hasBio,user_hasExistingLocation,user_hasHeaderImg,user_hasLocation,user_hasProfileImg,user_hasURL,user_indegree,user_isVerified,user_numFavorites,user_numFollowers,user_numFriends,user_numMediaContent,user_numTweets,user_timesListed,user_tweetRatio,user_wotTrustUser
0,15.0,6.0,44.0,49.0,True,True,False,False,False,False,False,False,True,False,88,2,2,0,0.0,2.0,0.0,0,0.0,1,1,13,112.085,0,20971520.0,6380.0,,67.0,0.438754,1350475461,,,,,,True,True,False,True,True,True,,False,168,634,1445,0,903,1,4.953727,6.0
1,,,,,False,False,False,False,False,True,True,True,False,False,135,0,1,0,1.0,7.0,0.0,0,1.0,1,17,18,30.333,0,,,,,0.479646,1235638869,,,,,,True,True,False,True,True,False,,False,879,271,565,133,11972,11,7.913144,
2,,,,,False,False,False,False,False,False,False,False,False,False,88,0,0,0,0.0,2.0,0.0,0,1.0,1,3,13,46.605,0,,,,,1.126736,1222220242,,,,,,True,True,False,True,True,True,,False,22,649,576,762,8313,28,4.983059,
3,15.0,6.0,44.0,49.0,False,False,False,False,False,False,False,False,True,False,137,0,2,0,1.0,,0.0,0,1.0,1,1,19,,0,20971520.0,6380.0,,67.0,0.543956,1306061808,13.0,-3.0,15.0,16.0,22020096.0,True,False,False,True,True,True,193085.0,False,69,297,546,545,2049,4,2.943054,65.0
4,,,,,False,False,False,False,False,False,True,True,False,False,96,0,0,0,0.0,1.0,0.0,0,1.0,2,0,9,94.3,0,,,,,0.983607,1282240608,,,,,,True,False,False,False,True,False,,False,1,60,61,66,816,0,0.840142,


#### One-hot encoding
Some of our features are categorical, thus we need to apply one-hot encoding to them

In [190]:
cat_cols = []
num_cols = []
for col in features_raw.columns:
    if features_raw[col].dtype==bool or features_raw[col].dtype==object:
        cat_cols.append(col)
    else:
        num_cols.append(col)

['tweet_containsExclamationMark',
 'tweet_containsFirstOrderPron',
 'tweet_containsHappyEmo',
 'tweet_containsQuestionMark',
 'tweet_containsSadEmo',
 'tweet_containsSecondOrderPron',
 'tweet_containsThirdOrderPron',
 'tweet_hasColon',
 'tweet_hasExternalLink',
 'tweet_hasPlease',
 'user_hasBio',
 'user_hasExistingLocation',
 'user_hasHeaderImg',
 'user_hasLocation',
 'user_hasProfileImg',
 'user_hasURL',
 'user_isVerified']

In [191]:
features_raw = pd.get_dummies(features_raw, columns=cat_cols)
features_raw.head()

Unnamed: 0,tweet_alexaCountryRank,tweet_alexaDeltaRank,tweet_alexaPopularity,tweet_alexaReachRank,tweet_itemLength,tweet_numExclamationMark,tweet_numHashtags,tweet_numMentions,tweet_numNegSentiWords,tweet_numNouns,tweet_numPosSentiWords,tweet_numQuestionMark,tweet_numSlangs,tweet_numURLs,tweet_numUppercaseChars,tweet_numWords,tweet_readability,tweet_retweetCount,tweet_urlHarmonic,tweet_urlIndegree,tweet_wotSafe,tweet_wotTrust,user_FolFrieRatio,user_accountAge,user_alexaCountryRank,user_alexaDeltaRank,user_alexaPopularity,user_alexaReachRank,user_harmonic,user_indegree,user_numFavorites,user_numFollowers,user_numFriends,user_numMediaContent,user_numTweets,user_timesListed,user_tweetRatio,user_wotTrustUser,tweet_containsExclamationMark_False,tweet_containsExclamationMark_True,tweet_containsFirstOrderPron_False,tweet_containsFirstOrderPron_True,tweet_containsHappyEmo_False,tweet_containsHappyEmo_True,tweet_containsQuestionMark_False,tweet_containsQuestionMark_True,tweet_containsSadEmo_False,tweet_containsSadEmo_True,tweet_containsSecondOrderPron_False,tweet_containsSecondOrderPron_True,tweet_containsThirdOrderPron_False,tweet_containsThirdOrderPron_True,tweet_hasColon_False,tweet_hasColon_True,tweet_hasExternalLink_False,tweet_hasExternalLink_True,tweet_hasPlease_False,tweet_hasPlease_True,user_hasBio_False,user_hasBio_True,user_hasExistingLocation_False,user_hasExistingLocation_True,user_hasHeaderImg_False,user_hasHeaderImg_True,user_hasLocation_False,user_hasLocation_True,user_hasProfileImg_False,user_hasProfileImg_True,user_hasURL_False,user_hasURL_True,user_isVerified_False,user_isVerified_True
0,15.0,6.0,44.0,49.0,88,2,2,0,0.0,2.0,0.0,0,0.0,1,1,13,112.085,0,20971520.0,6380.0,,67.0,0.438754,1350475461,,,,,,,168,634,1445,0,903,1,4.953727,6.0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0
1,,,,,135,0,1,0,1.0,7.0,0.0,0,1.0,1,17,18,30.333,0,,,,,0.479646,1235638869,,,,,,,879,271,565,133,11972,11,7.913144,,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,1,0
2,,,,,88,0,0,0,0.0,2.0,0.0,0,1.0,1,3,13,46.605,0,,,,,1.126736,1222220242,,,,,,,22,649,576,762,8313,28,4.983059,,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0
3,15.0,6.0,44.0,49.0,137,0,2,0,1.0,,0.0,0,1.0,1,1,19,,0,20971520.0,6380.0,,67.0,0.543956,1306061808,13.0,-3.0,15.0,16.0,22020096.0,193085.0,69,297,546,545,2049,4,2.943054,65.0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,1,0,1,0,1,1,0
4,,,,,96,0,0,0,0.0,1.0,0.0,0,1.0,2,0,9,94.3,0,,,,,0.983607,1282240608,,,,,,,1,60,61,66,816,0,0.840142,,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,1,0,1,0,0,1,1,0,1,0


### Looking at outliers

Notes/Ideas
* predict num of nouns for missing data by looking into the ratio nouns per number of words on the rest of the tweets
* linear regression for the credibility metrics

## Classification

In [161]:
targets.value_counts()

1    10634
0     7223
Name: label, dtype: int64

In [162]:
# baseline
accuracy = sum(targets==1) / len(targets)
recall = 1
precision = sum(targets==1) / len(targets) # tp + fp

f_score = 2* (precision*recall) / (precision+recall)

print('Our baseline: Accuracy {:.4f}, F-score {:.4f}'.format(accuracy, f_score))

Our baseline: Accuracy 0.5955, F-score 0.7465


### Splitting into training and testing data

In [163]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(features_raw, targets, test_size=0.2, random_state=40)
print('Train Shape:', Xtrain.shape)
print('Test Shape:', Xtest.shape)

Train Shape: (14285, 73)
Test Shape: (3572, 73)


### Scaling

In [199]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(Xtrain)

Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

  return self.partial_fit(X, y)
  """
  


### Implementation: Cross validation 

In [200]:
from sklearn.model_selection import cross_val_score

def do_cv(predictor, X, y, cv):
    """
    Executes cross validation and display scores
    """
    print('### -- ### -- ' + str(type(predictor)).split('.')[-1][:-2] + ' -- ### -- ###')
    cv_score = cross_val_score(predictor, X, y, scoring='f1', cv=cv)
    print ('Mean F1 score after a 10-fold cross validation: ', cv_score.mean())
    print ('F1 score of each fold: ', cv_score)

In [201]:
# Helper function to help evaluating the model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score, precision_score, recall_score

def display_scores(predictor, X, y):
    """
    Calculates metrics and display it
    """
    print('\n### -- ### -- ' + str(type(predictor)).split('.')[-1][:-2] + ' -- ### -- ###')
    # Getting the predicted values
    ypred = predictor.predict(X)
    ypred_score = predictor.predict_proba(X)
    
    # calculating metrics
    accuracy = accuracy_score(y, ypred)
    roc = roc_auc_score(y, pd.DataFrame(ypred_score)[1])
    confusion = confusion_matrix(y, ypred)
    fscore = fbeta_score(y, ypred, beta=0.5)
    precision = precision_score(y, ypred)
    recall = recall_score(y, ypred)
    
    print('Confusion Matrix: ', confusion)
    print('Accuracy: ', accuracy)
    print('AUC: ', roc)
    print('recall:', recall)
    print('precision:', precision)

### Choosing a classifier

In [202]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the models
clf_A = RandomForestClassifier(random_state=0)
clf_B = LogisticRegression(random_state=0)
clf_C = GradientBoostingClassifier(random_state=0)
clf_D = AdaBoostClassifier(random_state=0)

for learner in [clf_A, clf_B, clf_C, clf_D]:
    
    learner.fit(Xtrain, ytrain)
    do_cv(learner, Xtrain, ytrain, 5)
    display_scores(learner, Xtrain, ytrain)



ValueError: Input contains NaN, infinity or a value too large for dtype('float32').