# Hashtags analysis

In [161]:
import pandas as pd
import numpy as np
import re
import string

pd.set_option('display.max_colwidth', 150)

In [2]:
df_train = pd.read_csv('./train.csv')

### Hypotesis - specific tags are more frequent for disaster tweets

In [208]:
hashtag_pattern = re.compile('#\w+')

def extract_hashtags(text):
    tokens = re.findall(hashtag_pattern, text)
    found_hashtags = filter(lambda token: len(token) > 0 and token[0]=='#', tokens)
    found_hashtags = list(map(lambda tag: tag.lstrip('#').lower(), found_hashtags))
    return found_hashtags


hashtags_series = []
all_hashtags = set()
PUNCTUATION = string.punctuation
hashtags_positive = {}
hashtags_negative = {}
for _, row in df_train.iterrows():
    text = row['text']
    target = int(row['target'])
    found_hashtags = extract_hashtags(text)
    for h in found_hashtags:
        all_hashtags.add(h)
    hashtags_series.append(found_hashtags)
    output = hashtags_positive if target else hashtags_negative
    for hashtag in found_hashtags:
        if hashtag in output:
            output[hashtag] = output[hashtag] + 1
        else:
            output[hashtag] = 1

df_train['hashtags'] = hashtags_series


In [218]:
df_train[['text', 'hashtags', 'target']].sample(n=10)

Unnamed: 0,text,hashtags,target
2733,@MikeParrActor has confirmed on his twitter saying goodbye 2 ross. Am bloody gobsmacked/devastated #emmerdale,[emmerdale],0
2365,Shame how they took'em from being an intriguing dominant force to a jobbing C-list demolition https://t.co/1xSSvGIMvb,[],0
4600,@NEPD_Loyko Texans hope you are wrong. Radio in Houston have him as starter after Foster injury,[],0
926,White family (supposedly representing America's GREAT values ) gets blown up in a horrible CGI nuclear strike..... LMFAOOOO!!!!!!!!!!!!,[],1
4195,@DannyRaynard not bad personally I'd get rid of either hazard or aguero for a better striker than berahino,[],0
7499,I still need to finish the lover but I'm watching this other drama for 8 hours now and I'm an emotional wreck so the lover needs to wait,[],0
2343,Demolition Means Progress: Flint Michigan and the Fate of the American Metropolis Highsmith https://t.co/ZvoBMDxHGP,[],1
7483,Anyone know if Fox ÛÏNewsÛ will be live-streaming tonightÛªs Republican debate online? I want to watch the train wreck.,[],0
1441,@NorthBayHealth Trauma Center Shines In Response To Multi-Casualty Crash. http://t.co/21B6SKPDUR http://t.co/wBCb3sYtj7,[],1
722,@DarrylB1979 yea heard about that..not coming out until 2017 and 2019 ?????? Vampiro is bleeding,[],0


Lets calculate following metrics for hashcodes:
- `cnt_positive` - how many tweets labeled as disaster use the hash code.
- `cnt_negative` - how many tweets labeled as non-disaster use the hash code.
- `all_count` - all tweets having the hashcode.
- `positive_fact` - probability of using a hashtag used in disaster tweet.
- `negative_fact` - probability of using a hashtag in non-disaster tweet.
- `sentiment` - score [-1..1], scaled from `positive_fact` and an indication in which type of tweet the hashtag is used. Zero refers to no usage, or no bias.

In [219]:
# fill with zeros for not found hashtags
hashtag_counts = []
for h in all_hashtags:
    cnt_positive = hashtags_positive[h] if h in hashtags_positive else 0
    cnt_negative = hashtags_negative[h] if h in hashtags_negative else 0
    all_count = cnt_positive + cnt_negative
    positive_fact = cnt_positive / all_count if all_count > 0 else None
    sentiment = 2*(positive_fact - 0.5)
    hashtag_counts.append((h, cnt_positive, cnt_negative, all_count, positive_fact, sentiment))

df_hashtags = pd.DataFrame(data=hashtag_counts, columns=['hashtag', 'cnt_positive', 'cnt_negative', 'all_count', 'positive_fact', 'sentiment']).set_index('hashtag')
df_hashtags.sort_values('all_count', ascending=False, inplace=True)

#### Most specific hash tags (with absolut sentiment value >= 0.5)

In [220]:
search_query = 'all_count >= 5 and (abs(sentiment) >= 0.5)'
df_hashtags.query(search_query)

Unnamed: 0_level_0,cnt_positive,cnt_negative,all_count,positive_fact,sentiment
hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nowplaying,2,21,23,0.086957,-0.826087
hiroshima,22,0,22,1.000000,1.000000
earthquake,19,0,19,1.000000,1.000000
gbbo,4,14,18,0.222222,-0.555556
jobs,0,14,14,0.000000,-1.000000
...,...,...,...,...,...
phoenix,4,1,5,0.800000,0.600000
quote,1,4,5,0.200000,-0.600000
np,0,5,5,0.000000,-1.000000
science,5,0,5,1.000000,1.000000


#### How many tweets with most specific hash tags are used in?

In [221]:
query_cnt = df_hashtags.query(search_query)['all_count'].sum()
all_tweets = df_hashtags['all_count'].sum()
print(f'All tweets: {all_tweets}, tweets with relevant hashtags: {query_cnt} ({100*query_cnt/all_tweets:.0f}%)')

All tweets: 3330, tweets with relevant hashtags: 587 (18%)


## Correlation with the target

In [222]:
hashtags_score_result = []
for _, row in df_train.iterrows():
    hashtags = row['hashtags']
    values = []
    weights = []
    for h in hashtags:
        counts = df_hashtags.loc[h]['all_count']
        sentiment = df_hashtags.loc[h]['sentiment']
        values.append(sentiment)
        weights.append(counts)
    sentiment = np.average(values, weights=weights) if len(values) > 0 else .0
    hashtags_score_result.append(sentiment)
df_train['hashtags_sentiment'] = hashtags_score_result


In [223]:
df_train.sample(n=10)

Unnamed: 0,id,keyword,location,text,target,hashtags,hashtag_positive_score,hashtags_sentiment
2935,4218,drowned,Dubai,Migrants drown at sea after boat capsizes off #Libya http://t.co/t4pv0nrOoV http://t.co/PSeYLYzck4,1,[libya],1.0,1.0
314,457,armageddon,Canada,@ENews Ben Affleck......I know there's a wife/kids and other girls but I can't help it. I've loved him since Armageddon #eonlinechat,0,[eonlinechat],-1.0,-1.0
948,1374,blown%20up,,Turn on ESPN2 and get blown up,0,[],0.0,0.0
5061,7213,natural%20disaster,"Oneonta, NY/ Staten Island, NY",its only getting colder and colder and faster and faster and when i first realized it it was like a natural disaster,1,[],0.0,0.0
5570,7949,rainstorm,"Bridport, England",I want it to rainstorm PLEASE,0,[],0.0,0.0
6265,8952,storm,,kesabaran membuahkan hasil indah pada saat tepat! life isn't about waiting for the storm to pass it's about learning to dance in the rain.,1,[],0.0,0.0
7216,10335,weapons,ohio,@danagould @WaynesterAtl I agree with background checks. I just think guns or weapons in general are the great equalizer.,0,[],0.0,0.0
7519,10752,wreckage,Mumbai,Wreckage 'Conclusively Confirmed' as From MH370: Malaysia PM: Investigators and the families of those who were... http://t.co/4sf0rgn8Wo,1,[],0.0,0.0
5112,7291,nuclear%20disaster,,#Nuclear policy of #Japan without responsibility about Nuclear #Disaster will repeat same #failure.\n#annonymous #guardian #NYTimes #Reuters,1,"[nuclear, japan, disaster, failure, annonymous, guardian, nytimes, reuters]",7.555556,0.894737
3090,4436,electrocute,London,no but seriously I will electrocute half of UK Army's so I can touch bangtan i do not play games when it comes to bts,0,[],0.0,0.0


In [224]:
df_train[['target', 'hashtags_sentiment']].corr()


Unnamed: 0,target,hashtags_sentiment
target,1.0,0.423561
hashtags_sentiment,0.423561,1.0


**Conflusion: there is many hashtags used more often for disaster tweets than for normal tweets.**


In [225]:
df_hashtags.sample(n=10)

Unnamed: 0_level_0,cnt_positive,cnt_negative,all_count,positive_fact,sentiment
hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
vivaargentina,0,1,1,0.0,-1.0
tarzana,0,1,1,0.0,-1.0
ksbynews,1,0,1,1.0,1.0
guardian,2,0,2,1.0,1.0
chelsea,0,1,1,0.0,-1.0
evilempire,0,1,1,0.0,-1.0
climatechange,2,0,2,1.0,1.0
pbs,1,0,1,1.0,1.0
163,1,0,1,1.0,1.0
nå¼36,0,1,1,0.0,-1.0


In [226]:
df_hashtags.to_csv('./hashtags_sentiment.csv')