In [116]:
# imports
import pandas as pd
import numpy as np
import time

# seed
np.random.seed(0)

In [117]:
# where data is located
!ls ./drive/MyDrive/Thesis\ Workspace/Notebooks/data/set-3
DATA_PATH = "./drive/MyDrive/Thesis\ Workspace/Notebooks/data/set-3"

dev  test  train


In [118]:
folder_names = ['dev', 'test', 'train']
for name in folder_names:
  print('---Folder: ', name)
  %ls $DATA_PATH\/$name

---Folder:  dev
domain.csv    following.csv  profile_info.csv      tweet.csv
follower.csv  label.csv      profile_location.csv
---Folder:  test
domain.csv    following.csv  profile_info.csv      tweet.csv
follower.csv  label.csv      profile_location.csv
---Folder:  train
domain.csv    following.csv  profile_info.csv      tweet.csv
follower.csv  label.csv      profile_location.csv


In [119]:
DATA_PATH = "./drive/MyDrive/Thesis Workspace/Notebooks/data/set-3"


def load_tweet_and_label(subfolder):
  df_tweet = pd.read_csv(f'{DATA_PATH}/{subfolder}/tweet.csv').iloc[:, 1:].reset_index(drop=True)
  df_label = pd.read_csv(f'{DATA_PATH}/{subfolder}/label.csv').iloc[:, 1:]
  df_tweet = df_tweet.merge(df_label, on='ID')
  return df_tweet

subfolder_names = ['dev', 'train', 'test']
df_tweet_dev, df_tweet_train, df_tweet_test = [load_tweet_and_label(x) for x in subfolder_names]
for folder, df in zip(subfolder_names, [df_tweet_dev, df_tweet_train, df_tweet_test]):
  print('Folder: ', folder, 'Shape: ', df.shape)
  display(df.head())
  char_count = df['tweet'].astype(str).apply(lambda x: len(x))
  print(df[char_count==0])

Folder:  dev Shape:  (401540, 3)


Unnamed: 0,ID,tweet,label
0,1224667050301255680,@SparklesOnlyme পুরোনো এইদিনের কথা\n,0
1,1224667050301255680,@BariraJahan হায়\n,0
2,1224667050301255680,সেদিন রাস্তার ধারে নুনু চুলাকাচ্ছিলাম।\n\nকে জ...,0
3,1224667050301255680,"নিজের বলতে কিছু নাইরে মাদারচোদ,\n\nসালার নুনু ...",0
4,1224667050301255680,ফোন টিপতে টিপতেই জীবন শেষ হবে অন্যকিছু আর টিপত...,0


Empty DataFrame
Columns: [ID, tweet, label]
Index: []
Folder:  train Shape:  (1398465, 3)


Unnamed: 0,ID,tweet,label
0,17461978,RT @CarnivalCruise: 🎉 Are you ready to see wha...,0
1,17461978,Who has time for receipts? Not me. @epson rece...,0
2,17461978,Steady wants to encourage you to invest in you...,0
3,17461978,"Good one, @rishid. But let’s see if y'all can ...",0
4,17461978,#lsunationalchamps\n,0


Empty DataFrame
Columns: [ID, tweet, label]
Index: []
Folder:  test Shape:  (199863, 3)


Unnamed: 0,ID,tweet,label
0,1188812492010487808,RT @clevelanddotcom: Three Ohio House Republic...,1
1,1188812492010487808,RT @CaliConserv1: California Governor Gavin Ne...,1
2,1188812492010487808,RT @NRA: Only after all our guns have been ban...,1
3,1188812492010487808,@ArtValley818_ CaliRed.\n,1
4,1188812492010487808,RT @ArtValley818_: I will be looking into star...,1


Empty DataFrame
Columns: [ID, tweet, label]
Index: []


## Regexes

In [120]:
import re

# hashtag #
test_hashtag = "Somethin big is going down, I can feel it in my @soul. . . .#somethingbig #happeningsoon #2017 #feel_it_in_my_soul #numberswork1337 ####"
HASHTAG_REGEX = r"#[a-z0-9_]+"
hashtag_matches = re.findall(HASHTAG_REGEX, test_hashtag)
print(hashtag_matches)


['#somethingbig', '#happeningsoon', '#2017', '#feel_it_in_my_soul', '#numberswork1337']


In [121]:
# mention @
test_mention = "@beggining This is a @with.dot of some cool #features that @under_score be useful but don't. look at this email@address.ignored @mention! @damm/bro make love @@ not @@@asdfaaf"
MENTION_REGEX = r"\s([@][\w_-]+)"
mention_matches = re.findall(MENTION_REGEX, test_mention)
print(mention_matches)

['@with', '@under_score', '@mention', '@damm']


In [122]:
# url google.com
test_url = """
Welcome to RegExr 0.3b, an intuitive tool for learning, writing, and testing Regular Expressions. Key features include: 
www.google.com
* real time results: shows results as you type 
* code hinting: roll over your expression to see info on specific elements 
* detailed results: roll over a match to see details & view group info below 
* built in regex guide: double click entries to insert them into your expression 
* online & desktop: regexr.com or download the desktop version for Mac, Windows, or Linux 
* save your expressions: My Saved expressions are saved locally 
* search Comm https://google.us.edi?34535/534534?dfg=g&fg unity expressions and add your own 
* create Share Links to send your expressions to co-workers or link to them on Twitter or your blog [ex. http://RegExr.com?2rjl6] 

Built by gskinner.com with Flex 3 [adobe.com/go/flex] and Spelling Plus Library for text highlighting [gskinner.com/products/spl].
"""
URL_REGEX = r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"
url_matches = re.findall(URL_REGEX, test_url)
print(url_matches)

['www.google.com', 'regexr.com', 'https://google.us.edi?34535/534534?dfg=g&fg', 'http://RegExr.com?2rjl6', 'gskinner.com', 'adobe.com/go/flex', 'gskinner.com/products/spl']


## NOTE
Missing feature:
- favorites of a tweet
- retweets of a tweet
- avg_tweet_same_time (no timestamp)

In [123]:
def feature_extraction(df):
  tweet_series = df['tweet'].astype(str)

  df = df.assign(num_hashtags = tweet_series.apply(lambda x: len(re.findall(HASHTAG_REGEX, x))))
  df = df.assign(num_mentions = tweet_series.apply(lambda x: len(re.findall(MENTION_REGEX, x))))
  df = df.assign(num_urls = tweet_series.apply(lambda x: len(re.findall(URL_REGEX, x))))
  df_grouped = df.groupby('ID')
  df_return = pd.DataFrame()

  character_per_tweet = pd.DataFrame({
      'text_len': tweet_series.apply(lambda x: len(x))
  })
  character_per_tweet['user_id'] = df['ID']
  character_per_tweet = character_per_tweet.groupby('user_id')['text_len']
  df_return['avg_characters'] = character_per_tweet.mean()
  df_return['std_characters'] = character_per_tweet.std()

  df_return['avg_hashtags'] = df_grouped['num_hashtags'].mean()
  df_return['avg_mentions'] = df_grouped['num_mentions'].mean()
  df_return['avg_urls'] = df_grouped['num_urls'].mean()

  # df_return['favorites_received'] = df_grouped['favorite_count'].sum()
  # df_return['retweets_received'] = df_grouped['retweet_count'].mean()

  # df_return['avg_tweet_same_time'] =df.groupby(['user_id', 'timestamp'])['timestamp'].agg(['count']).groupby('user_id').mean()
  df_return['label'] = df_grouped['label'].apply(lambda x: x.unique()[0])
  return df_return

In [124]:
feature_time = time.time()
df_dev = feature_extraction(df_tweet_dev)
end_feature_time = time.time()
print('Time: ', end_feature_time - feature_time)
print('shape: ', df_dev.shape)
df_dev.head()

Time:  12.350919485092163
shape:  (2365, 6)


Unnamed: 0_level_0,avg_characters,std_characters,avg_hashtags,avg_mentions,avg_urls,label
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3382,108.235,47.623627,0.01,1.045,0.235,0
13348,142.695,84.269381,0.01,1.075,0.215,0
759251,218.245,55.1181,0.0,0.15,1.12,0
813286,224.055,67.038217,0.0,0.32,0.805,0
994431,78.642857,44.271238,0.0,0.0,0.119048,0


In [125]:
feature_time = time.time()
df_train = feature_extraction(df_tweet_train)
end_feature_time = time.time()
print('Time: ', end_feature_time - feature_time)
print('shape: ', df_train.shape)
df_train.head()

Time:  42.549440145492554
shape:  (8278, 6)


Unnamed: 0_level_0,avg_characters,std_characters,avg_hashtags,avg_mentions,avg_urls,label
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12,112.140704,59.040307,0.040201,1.120603,0.39196,0
1605,148.205,72.95447,0.005,0.33,0.25,0
10350,72.89,45.363943,0.0,0.405,0.225,0
10441,115.005,62.810275,0.15,1.21,0.6,1
12830,170.585,69.144033,0.555,0.04,1.23,0


In [126]:
feature_time = time.time()
df_test = feature_extraction(df_tweet_test)
end_feature_time = time.time()
print('Time: ', end_feature_time - feature_time)
print('shape: ', df_test.shape)
df_test.head()

Time:  6.135536193847656
shape:  (1183, 6)


Unnamed: 0_level_0,avg_characters,std_characters,avg_hashtags,avg_mentions,avg_urls,label
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
691353,129.100503,63.987451,0.090452,1.366834,0.537688,0
1068831,90.74,17.731453,0.0,0.03,0.995,0
1367531,26.57,14.007001,0.0,0.0,1.0,0
5286532,122.315,75.405209,0.205,0.485,0.25,0
5894372,126.545,50.115525,0.01,0.445,0.92,0


### Check and cleanup

In [127]:
outputs = [df_dev, df_train, df_test]

In [128]:
for df in outputs:
  display(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2365 entries, 3382 to 1301535104989265920
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   avg_characters  2365 non-null   float64
 1   std_characters  2322 non-null   float64
 2   avg_hashtags    2365 non-null   float64
 3   avg_mentions    2365 non-null   float64
 4   avg_urls        2365 non-null   float64
 5   label           2365 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 129.3 KB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8278 entries, 12 to 1301790704050593792
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   avg_characters  8278 non-null   float64
 1   std_characters  8135 non-null   float64
 2   avg_hashtags    8278 non-null   float64
 3   avg_mentions    8278 non-null   float64
 4   avg_urls        8278 non-null   float64
 5   label           8278 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 452.7 KB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1183 entries, 691353 to 1301559664690319360
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   avg_characters  1183 non-null   float64
 1   std_characters  1153 non-null   float64
 2   avg_hashtags    1183 non-null   float64
 3   avg_mentions    1183 non-null   float64
 4   avg_urls        1183 non-null   float64
 5   label           1183 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 64.7 KB


None

In [129]:
for df in outputs:
  display(df.apply(lambda x: x.unique()))

avg_characters    [108.235, 142.695, 218.245, 224.055, 78.642857...
std_characters    [47.623626742622235, 84.26938126087347, 55.118...
avg_hashtags      [0.01, 0.0, 0.005, 0.04, 0.020202020202020204,...
avg_mentions      [1.045, 1.075, 0.15, 0.32, 0.0, 0.565, 1.335, ...
avg_urls          [0.235, 0.215, 1.12, 0.805, 0.1190476190476190...
label                                                        [0, 1]
dtype: object

avg_characters    [112.14070351758794, 148.205, 72.89, 115.005, ...
std_characters    [59.040307251153806, 72.95447038217547, 45.363...
avg_hashtags      [0.04020100502512563, 0.005, 0.0, 0.15, 0.555,...
avg_mentions      [1.120603015075377, 0.33, 0.405, 1.21, 0.04, 0...
avg_urls          [0.39195979899497485, 0.25, 0.225, 0.6, 1.23, ...
label                                                        [0, 1]
dtype: object

avg_characters    [129.10050251256283, 90.74, 26.57, 122.315, 12...
std_characters    [63.987451024674314, 17.731452904670636, 14.00...
avg_hashtags      [0.09045226130653267, 0.0, 0.205, 0.01, 0.005,...
avg_mentions      [1.3668341708542713, 0.03, 0.0, 0.485, 0.445, ...
avg_urls          [0.5376884422110553, 0.995, 1.0, 0.25, 0.92, 0...
label                                                        [0, 1]
dtype: object

In [130]:
for df in outputs:
  display(df.isin([np.nan, np.inf, -np.inf]).sum())

avg_characters     0
std_characters    43
avg_hashtags       0
avg_mentions       0
avg_urls           0
label              0
dtype: int64

avg_characters      0
std_characters    143
avg_hashtags        0
avg_mentions        0
avg_urls            0
label               0
dtype: int64

avg_characters     0
std_characters    30
avg_hashtags       0
avg_mentions       0
avg_urls           0
label              0
dtype: int64

### NOTE
- As seen above, there are `Nan` values for std
- This happends due to the formula of std of sample which has degree of freedom being `N-1`
- So if sample only have one value, STD will be `Nan`
- Simple solution is to set `Nan` STD as `0`

### Fix Nan STD
std_characters    

In [131]:
df_dev['std_characters'] = df_dev['std_characters'].fillna(0)
df_train['std_characters'] = df_train['std_characters'].fillna(0)
df_test['std_characters'] = df_test['std_characters'].fillna(0)

In [132]:
for df in [df_dev, df_train, df_test]:
  display(df.isin([np.nan, np.inf, -np.inf]).sum())

avg_characters    0
std_characters    0
avg_hashtags      0
avg_mentions      0
avg_urls          0
label             0
dtype: int64

avg_characters    0
std_characters    0
avg_hashtags      0
avg_mentions      0
avg_urls          0
label             0
dtype: int64

avg_characters    0
std_characters    0
avg_hashtags      0
avg_mentions      0
avg_urls          0
label             0
dtype: int64

In [135]:
save_path = "./drive/MyDrive/Thesis Workspace/Notebooks/data/sb15-set3/"
for name, df in zip(subfolder_names, [df_dev, df_train, df_test]):
  df.to_csv(f'{save_path}{name}.csv', index = False)
