# Hashtags analysis

In [1]:
import pandas as pd
import numpy as np
import re
import string

# Widen text columns so that tweet bodies are not truncated in rendered tables.
pd.options.display.max_colwidth = 150

In [2]:
# Load the labelled training tweets and report the class balance.
# `negative_prop` / `positive_prop` are reused further down to correct the
# hashtag statistics for class imbalance.
df_train = pd.read_csv('./train-dataset.csv')
n_samples = len(df_train)
negative_count = (df_train['target'] == 0).sum()
positive_count = (df_train['target'] == 1).sum()
negative_prop = negative_count / n_samples
positive_prop = positive_count / n_samples
print(f'All samples: {n_samples}, negative: {100*negative_prop:.1f}%, positive: {100*positive_prop:.1f}%')


All samples: 6851, negative: 57.1%, positive: 42.9%


### Hypothesis - specific hashtags are more frequent in disaster tweets

In [3]:
# Raw string: in a plain literal '\w' is an invalid escape sequence, which
# newer Python versions warn about.
hashtag_pattern = re.compile(r'#\w+')

def extract_hashtags(text):
    """Return the hashtags found in ``text``, lower-cased and without the '#'.

    A hashtag is a '#' immediately followed by one or more word characters.
    Tags are returned in order of appearance and duplicates are kept.

    Parameters
    ----------
    text : str
        Tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell, or None) is treated as having no hashtags.

    Returns
    -------
    list of str
        Possibly empty list of hashtag names.
    """
    if not isinstance(text, str):
        return []
    # Every match starts with exactly one '#', so slicing off the first
    # character is enough; no extra filtering of the matches is needed.
    return [token[1:].lower() for token in hashtag_pattern.findall(text)]


# Walk over the training tweets once and collect:
#   hashtags_series    - list of hashtags per tweet, in df_train row order
#   all_hashtags       - every distinct hashtag seen
#   hashtags_positive  - usage count per hashtag in tweets with a truthy target
#   hashtags_negative  - usage count per hashtag in the remaining tweets
hashtags_series = []
all_hashtags = set()
PUNCTUATION = string.punctuation
hashtags_positive = {}
hashtags_negative = {}
for tweet_text, tweet_target in zip(df_train['text'], df_train['target']):
    tweet_tags = extract_hashtags(tweet_text)
    all_hashtags.update(tweet_tags)
    hashtags_series.append(tweet_tags)
    # Pick the counter that matches the tweet's class.
    class_counts = hashtags_positive if int(tweet_target) else hashtags_negative
    for tag in tweet_tags:
        class_counts[tag] = class_counts.get(tag, 0) + 1

df_train['hashtags'] = hashtags_series


In [4]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_train[['text', 'hashtags', 'target']].sample(n=10, random_state=42)

Unnamed: 0,text,hashtags,target
5923,Two air ambulances on scene of serious crash between two cars and lorry in ... - http://t.co/9pFEaQeSki http://t.co/fntG70rnkx | #EMSNeÛ_,[emsne],1
3014,It was finally demolished in the spring of 2013 and the property has sat vacant since. The justÛ_: saddlebrooke... http://t.co/b8n6e4rYvZ,[],0
1780,@Drsarwatzaib070 come on. IK will face MCourt for attacking parliment and hijacking TV station.,[],1
6774,Choking Hazard Prompts Recall Of Kraft Cheese Singles http://t.co/XGKyVF9t4f,[],0
1450,Lol meerkat is fucked. They will get demolished by periscope and Facebook live streaming.,[],0
3318,Tell the United Nations: Plantations are NOT forests! https://t.co/cic7h64Qv8 via @RainforestResq,[],0
3790,Crawling in my skin\nThese wounds they will not hea,[],1
369,Lightning strike in the distance via /r/pics http://t.co/iDmhSwewQw #pics,[pics],1
4668,According to prophecy and also CNN a Mac tablet will completely obliterate the need for other gadgets. CombiningÛ_ http://t.co/xfccvMXuWb,[],0
1227,./.....hmm 12000 Nigerian refugees repatriated from Cameroon http://t.co/YTW9SlWvmg /(,[],1


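Let's calculate the following metrics for each hashtag:
- `cnt_positive` - how many times the hashtag is used in tweets labeled as disaster.
- `cnt_negative` - how many times the hashtag is used in tweets labeled as non-disaster.
- `all_count` - total usage of the hashtag (`cnt_positive + cnt_negative`).
- `positive_fact` - fraction of the hashtag's uses that occur in disaster tweets (`cnt_positive / all_count`).
- `negative_fact` - fraction of the hashtag's uses that occur in non-disaster tweets (`1 - positive_fact`; not stored as a separate column).
- `sentiment` - score in [-1, 1], rescaled from `positive_fact` as `2 * (positive_fact - 0.5)`; it indicates in which type of tweet the hashtag is used. Zero means no usage or no bias.
- `sentiment_adj` - the same score computed after dividing each count by its class's share of the training set, which corrects for class imbalance.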

In [5]:
# Build one row of usage statistics per hashtag seen in the training set.
hashtag_counts = []
# Iterate in sorted order: set iteration order varies between kernel sessions,
# which would otherwise make the row order among ties non-reproducible.
for h in sorted(all_hashtags):
    # dict.get() fills in zero for a hashtag never seen in one of the classes.
    cnt_positive = hashtags_positive.get(h, 0)
    cnt_negative = hashtags_negative.get(h, 0)
    # Divide each count by its class share to correct for class imbalance.
    cnt_positive_adj = cnt_positive / positive_prop
    cnt_negative_adj = cnt_negative / negative_prop
    all_count = cnt_positive + cnt_negative
    if all_count > 0:
        positive_fact = cnt_positive / all_count
        # Rescale the [0, 1] fraction to [-1, 1].
        sentiment = 2*(positive_fact - 0.5)
        sentiment_adj = 2*(cnt_positive_adj/(cnt_negative_adj+cnt_positive_adj)) - 1.0
    else:
        # No usage at all: the fraction is undefined and both scores are
        # neutral. (Previously `sentiment` was computed from a None
        # `positive_fact` here, which would raise TypeError.)
        positive_fact = np.nan
        sentiment = .0
        sentiment_adj = .0
    hashtag_counts.append((h, cnt_positive, cnt_negative, all_count, positive_fact, sentiment, sentiment_adj))

df_hashtags = (
    pd.DataFrame(data=hashtag_counts, columns=['hashtag', 'cnt_positive', 'cnt_negative', 'all_count', 'positive_fact', 'sentiment', 'sentiment_adj'])
    .set_index('hashtag')
    # Most used hashtags first; a stable sort keeps ties in alphabetical order.
    .sort_values('all_count', ascending=False, kind='mergesort')
)

In [6]:
# Non-null values per column of df_hashtags, which holds one row per distinct hashtag.
df_hashtags.count()

cnt_positive     1781
cnt_negative     1781
all_count        1781
positive_fact    1781
sentiment        1781
sentiment_adj    1781
dtype: int64

In [7]:
# How many hashtags were used fewer than three times in total.
(
    df_hashtags
    .query('all_count <3')
    .count()
)

cnt_positive     1562
cnt_negative     1562
all_count        1562
positive_fact    1562
sentiment        1562
sentiment_adj    1562
dtype: int64

#### Most specific hashtags (absolute sentiment value >= 0.5, used at least 5 times)

In [8]:
# Filter for "specific" hashtags: used at least 5 times and with |sentiment| >= 0.5.
# Kept in a variable because the same filter is reused when summing `all_count` further down.
search_query = 'all_count >= 5 and (abs(sentiment) >= 0.5)'
df_hashtags.query(search_query)

Unnamed: 0_level_0,cnt_positive,cnt_negative,all_count,positive_fact,sentiment,sentiment_adj
hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
nowplaying,2,19,21,0.095238,-0.809524,-0.754213
hiroshima,19,0,19,1.000000,1.000000,1.000000
gbbo,4,14,18,0.222222,-0.555556,-0.448956
earthquake,15,0,15,1.000000,1.000000,1.000000
jobs,0,14,14,0.000000,-1.000000,-1.000000
...,...,...,...,...,...,...
lgbt,1,4,5,0.200000,-0.600000,-0.500638
libya,5,0,5,1.000000,1.000000,1.000000
mumbai,5,0,5,1.000000,1.000000,1.000000
newyork,5,0,5,1.000000,1.000000,1.000000


In [9]:
# Hashtags used more than 10 times, highest adjusted sentiment first
# (disaster-leaning hashtags at the top).
(
    df_hashtags
    .query('all_count > 10')
    .sort_values('sentiment_adj', ascending=False)
    .head(n=20)
)

Unnamed: 0_level_0,cnt_positive,cnt_negative,all_count,positive_fact,sentiment,sentiment_adj
hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hiroshima,19,0,19,1.0,1.0,1.0
earthquake,15,0,15,1.0,1.0,1.0
news,51,19,70,0.728571,0.457143,0.562637
isis,8,3,11,0.727273,0.454545,0.560392
islam,9,13,22,0.409091,-0.181818,-0.04085
best,10,17,27,0.37037,-0.259259,-0.121718
prebreak,10,17,27,0.37037,-0.259259,-0.121718
hot,10,18,28,0.357143,-0.285714,-0.149769
gbbo,4,14,18,0.222222,-0.555556,-0.448956
nowplaying,2,19,21,0.095238,-0.809524,-0.754213


In [10]:
# Hashtags used more than 10 times, lowest adjusted sentiment first
# (non-disaster-leaning hashtags at the top).
(
    df_hashtags
    .query('all_count > 10')
    .sort_values('sentiment_adj', ascending=True)
    .head(n=20)
)

Unnamed: 0_level_0,cnt_positive,cnt_negative,all_count,positive_fact,sentiment,sentiment_adj
hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
jobs,0,14,14,0.0,-1.0,-1.0
job,0,12,12,0.0,-1.0,-1.0
hiring,1,10,11,0.090909,-0.818182,-0.765059
nowplaying,2,19,21,0.095238,-0.809524,-0.754213
gbbo,4,14,18,0.222222,-0.555556,-0.448956
hot,10,18,28,0.357143,-0.285714,-0.149769
best,10,17,27,0.37037,-0.259259,-0.121718
prebreak,10,17,27,0.37037,-0.259259,-0.121718
islam,9,13,22,0.409091,-0.181818,-0.04085
isis,8,3,11,0.727273,0.454545,0.560392


#### How many tweets are the most specific hashtags used in?

In [11]:
# `all_count` is a number of hashtag occurrences, so both sums count
# occurrences rather than distinct tweets: a tweet with several hashtags
# contributes several times and a tweet without hashtags contributes nothing.
# The message is labelled accordingly (it previously said "tweets").
query_cnt = df_hashtags.query(search_query)['all_count'].sum()
all_tweets = df_hashtags['all_count'].sum()
print(f'All hashtag occurrences: {all_tweets}, occurrences of relevant hashtags: {query_cnt} ({100*query_cnt/all_tweets:.0f}%)')

All tweets: 2990, tweets with relevant hashtags: 483 (16%)


## Correlation with the target

In [12]:
# Score every tweet from the statistics of the hashtags it contains:
#   hashtags_sentiment      - average of the hashtags' `sentiment`, weighted by
#                             each hashtag's overall usage (`all_count`)
#   hashtags_sentiment_adj  - plain sum of the hashtags' `sentiment_adj`
#                             (so it is not bounded to [-1, 1])
# Tweets without hashtags get 0.0 for both.
#
# NOTE(review): df_hashtags was built from the targets of these same df_train
# rows, so each tweet's own label contributes to its score; any correlation
# with `target` measured on df_train is therefore optimistic.

# One-time lookup tables instead of repeated `df_hashtags.loc[h][col]` calls,
# each of which materialises a whole row for every hashtag of every tweet.
tag_sentiment = df_hashtags['sentiment'].to_dict()
tag_sentiment_adj = df_hashtags['sentiment_adj'].to_dict()
tag_weight = df_hashtags['all_count'].to_dict()

hashtags_sentiment_result = []
hashtags_sentiment_adj_result = []
for hashtags in df_train['hashtags']:
    if len(hashtags) > 0:
        sentiment = np.average(
            [tag_sentiment[h] for h in hashtags],
            weights=[tag_weight[h] for h in hashtags],
        )
        sentiment_adj = np.sum([tag_sentiment_adj[h] for h in hashtags])
    else:
        sentiment = .0
        sentiment_adj = .0
    hashtags_sentiment_result.append(sentiment)
    hashtags_sentiment_adj_result.append(sentiment_adj)
df_train['hashtags_sentiment'] = hashtags_sentiment_result
df_train['hashtags_sentiment_adj'] = hashtags_sentiment_adj_result

In [13]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_train.sample(n=10, random_state=42)

Unnamed: 0,id,keyword,location,text,target,hashtags,hashtags_sentiment,hashtags_sentiment_adj
2898,7274,nuclear%20disaster,,3 Former Executives To Be Prosecuted In Fukushima Nuclear Disaster http://t.co/UmjpRRwRUU,1,[],0.0,0.0
4954,10195,violent%20storm,,Storm blitzes Traverse City disrupts Management Briefing Seminars: A violent summer storm blitzed through Tra... http://t.co/NKAW9EZqGg,1,[],0.0,0.0
3289,725,attacked,"LEALMAN, FLORIDA",Christian Attacked by Muslims at the Temple Mount after Waving Israeli Flag via Pamela Geller - ... http://t.co/LHBZHWq4B9,1,[],0.0,0.0
698,4244,drowned,,HOPE THE DROWNED @eeasterling_2,0,[],0.0,0.0
562,3522,derailment,India,Madhya Pradesh Train Derailment: Village Youth Saved Many Lives,1,[],0.0,0.0
5408,1374,blown%20up,,Turn on ESPN2 and get blown up,0,[],0.0,0.0
2736,8074,rescue,,Sammy and todd always to the rescue may not be in uniform but still to the rescue lmao. Forever KFC fam.,0,[],0.0,0.0
6666,4383,earthquake,,There has not been 1 real tear out of #Shelli 's eyes this entire episode. #bb17,0,"[shelli, bb17]",-1.0,-2.0
4588,2648,crashed,International,TTW Today's News: Bin Laden family plane crashed after 'avoiding microlight and landing too far down runway' http://t.co/BUMzvmwAM3,1,[],0.0,0.0
6554,4677,engulfed,"Rochester, NY",When Your Cake Is Engulfed In Flames #LiteraryCakes,0,[literarycakes],-1.0,-1.0


In [14]:
# Pairwise correlation between the label and the per-tweet hashtag score.
# NOTE(review): the score is derived from the labels of these same rows, so
# this value is likely optimistic - confirm on held-out data.
(
    df_train[['target', 'hashtags_sentiment']]
    .corr()
)


Unnamed: 0,target,hashtags_sentiment
target,1.0,0.42529
hashtags_sentiment,0.42529,1.0


**Conclusion: there are many hashtags that are used more often in disaster tweets than in normal tweets.**


In [15]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_hashtags.sample(n=10, random_state=42)

Unnamed: 0_level_0,cnt_positive,cnt_negative,all_count,positive_fact,sentiment,sentiment_adj
hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
incident,1,0,1,1.0,1.0,1.0
streetjamzdotnet,1,0,1,1.0,1.0,1.0
tb,0,1,1,0.0,-1.0,-1.0
greetingcards,0,1,1,0.0,-1.0,-1.0
lh_movie,0,1,1,0.0,-1.0,-1.0
allah,2,2,4,0.5,0.0,0.142023
marinelines,1,0,1,1.0,1.0,1.0
caraccidentlawyer,0,1,1,0.0,-1.0,-1.0
demolition,0,2,2,0.0,-1.0,-1.0
skardu,0,1,1,0.0,-1.0,-1.0


In [16]:
# Persist the per-hashtag statistics; the `hashtag` index is written as the
# first CSV column.
df_hashtags.to_csv('./hashtags_sentiment.csv', index=True)