# boeing-LCC-twitter-brand-listening
This application helps LCC brands listen to their customers' feedback about their services via Twitter sentiment analysis and keyword clustering.

## Load GitHub

In [0]:
# Clone the project repository, make it the working directory, and list its
# contents. The relative paths used later in this notebook (e.g. the
# feature_words/ folder) are resolved from this directory.
!git clone https://github.com/crosstreet74/boeing-LCC-twitter-brand-listening
%cd boeing-LCC-twitter-brand-listening
!ls

## crawling tweets

Crawl tweets posted around the time of each incident below.
* United Airlines / T'way Air
* Boeing 737 NG (to be considered)
 * 2016년 3월 19일, 플라이두바이 981편 추락 사고, 탑승자 62명 전원 사망.
 * 2018년 4월 17일, 사우스웨스트 항공 1380편 엔진파손 사건, 탑승자 149명 중 1명 사망, 148명 생존.
 * 2019년 9월 경, FAA, Boeing 737 NG crack report
* Boeing 737 MAX
 * 2018년 10월 29일, 라이온 에어 소속 737 MAX 8이 바다로 추락, 189명 전원 사망
 * 2019년 3월 10일, 에티오피아 항공 소속 737 MAX 8 여객기 추락, 157명 전원 사망

GetOldtweets3
* TweetCriteria
 * lang=en, query('united airline 737 MAX'), since='2018-10-01', until='2019-11-30'

The Korean-language search returned fewer tweets, so the English-language search was used instead.

URL links in the tweets were removed by manual editing.

In [0]:
pip install GetOldTweets3

In [0]:
import time
import datetime
import pandas
import GetOldTweets3 as got
from tqdm import tqdm_notebook
from random import uniform

In [0]:
# Collection window. `days_range` holds every day of the window as a
# "YYYY-MM-DD" string; later cells use days_range[0] / days_range[-1] as the
# search bounds and in the output file name.
start = datetime.datetime.strptime("2018-10-01", "%Y-%m-%d")
end = datetime.datetime.strptime("2019-11-30", "%Y-%m-%d")

# `+ 1` keeps the end date itself in the list. Without it the last entry is
# the day *before* `end`, so the reported window, the day count and the
# `until` bound were all one day short of the configured 2019-11-30.
date_generated = [
    start + datetime.timedelta(days=offset)
    for offset in range((end - start).days + 1)]

days_range = [day.strftime("%Y-%m-%d") for day in date_generated]

print("=== 설정된 트윗 수집 기간은 {} 에서 {} 까지 입니다 ===".format(
    days_range[0], days_range[-1]))
print("=== 총 {}일 간의 데이터 수집 ===".format(len(days_range)))

Searching for the tweets and copying them to the CSV file takes a few hours.

In [0]:
# Search criteria: English tweets matching the query, bounded by the first and
# last entries of days_range. setMaxTweets(-1) presumably means "no cap" --
# TODO confirm against the GetOldTweets3 documentation.
tweetCriteria = (
    got.manager.TweetCriteria()
    .setQuerySearch('united airline 737 MAX')
    .setSince(days_range[0])
    .setUntil(days_range[-1])
    .setMaxTweets(-1)
    .setLang('en')
)

print("Collecting data start.. from {} to {}".format(
    days_range[0], days_range[-1]))

# Time the download so the elapsed minutes can be reported afterwards.
start_time = time.time()
tweet = got.manager.TweetManager.getTweets(tweetCriteria)
elapsed_minutes = (time.time() - start_time)/60

print("Collecting data end.. {0:0.2f} Minutes".format(elapsed_minutes))
print("=== Total # of tweets is {} ===".format(len(tweet)))

In [0]:
# Flatten every fetched tweet into a [date, text] row and load the rows into a
# DataFrame with matching column names.
#
# The loop only reads attributes of objects that getTweets() has already
# returned, so nothing here needs rate limiting. The previous per-tweet
# time.sleep(uniform(1, 3)) merely added 1-3 seconds of idle time per tweet
# and has been removed.
tweet_list = [
    [fetched.date.strftime("%Y-%m-%d"), fetched.text]
    for fetched in tqdm_notebook(tweet)
]

twitter_df = pandas.DataFrame(tweet_list,
                              columns=["date", "text"])

In [0]:
# Write the collected tweets to a CSV file whose name carries the first and
# last day of the collection window, then report how many rows were gathered.
output_csv = "crawled_tweets_{}_to_{}.csv".format(days_range[0], days_range[-1])
twitter_df.to_csv(output_csv, index=False)
print("=== {} tweets are successfully saved ===".format(len(tweet_list)))

## Tweets Sentiment Analysis

The positive and negative word lists used here come from the source below:
* https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html


In [0]:
import os
import nltk
import matplotlib.pyplot as plt
from collections import Counter
from datetime import date
# Download NLTK's "punkt" resource. Presumably this is what the
# nltk.sent_tokenize / nltk.word_tokenize calls in parse_sentiment rely on --
# TODO confirm the resource name for the installed NLTK version.
nltk.download('punkt')

In [0]:
def words2lists(filename):
    """Read a UTF-8 text file into a list with one entry per non-empty line.

    Args:
        filename: Path of the file to read.

    Returns:
        list[str]: The lines of the file in order, with the trailing newline
        removed. Empty lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version never closed it.
    with open(filename, "r", encoding="utf-8") as file:
        stripped = (line.rstrip("\n") for line in file)
        # Skip empty lines instead of stopping at the first one: an empty
        # string used to be mistaken for end-of-file, which silently dropped
        # everything after a blank line.
        return [line for line in stripped if line]


def csv2list(filename):
    """Read a backtick-delimited UTF-8 text file into a list of rows.

    Args:
        filename: Path of the file to read.

    Returns:
        list[list[str]]: One entry per non-empty line, each being the line
        (without its trailing newline) split on the "`" character.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version never closed it.
    with open(filename, "r", encoding="utf-8") as file:
        stripped = (line.rstrip("\n") for line in file)
        # Skip empty lines rather than treating the first one as end-of-file,
        # which used to discard every row after a blank line.
        return [line.split("`") for line in stripped if line]

In [0]:
def sentimentalize(words):
    """Label a collection of words by which sentiment lexicons it overlaps.

    Returns BOTH when `words` shares entries with POSITIVE_WORDS and with
    NEGATIVE_WORDS, POSITIVE or NEGATIVE when it overlaps only that lexicon,
    and UNKNOWN when it overlaps neither.
    """
    has_positive = bool(POSITIVE_WORDS.intersection(words))
    has_negative = bool(NEGATIVE_WORDS.intersection(words))

    if has_positive and has_negative:
        return BOTH
    if has_positive:
        return POSITIVE
    if has_negative:
        return NEGATIVE
    return UNKNOWN


def count_sentiment(sentences):
    """Tally sentiment labels over a sequence of tokenised sentences.

    Each sentence is labelled with sentimentalize(). Two Counters keyed by
    that label are returned: the number of sentences per label, and the total
    length (via len()) of the sentences per label.
    """
    sentence_tally, token_tally = Counter(), Counter()

    for tokens in sentences:
        label = sentimentalize(tokens)
        sentence_tally[label] += 1
        token_tally[label] += len(tokens)

    return sentence_tally, token_tally


def parse_sentiment(text):
    """Return the dominant sentiment label of a piece of text.

    The text is split into sentences with nltk.sent_tokenize, each sentence is
    split into lower-cased tokens with nltk.word_tokenize, and
    count_sentiment() tallies how many tokens fall under each sentiment label.
    The label covering the most tokens is returned.

    Previously the function returned whatever label the final loop iteration
    left behind, i.e. the label that happened to be inserted last, not the
    prevailing one. It also raised UnboundLocalError for text without any
    sentence and ZeroDivisionError when the token total was zero.

    Args:
        text: The raw text (e.g. one tweet) to classify.

    Returns:
        One of the labels produced by sentimentalize(). UNKNOWN is returned
        when the text yields no sentences at all. On a tie, the label that was
        encountered first in the text wins.
    """
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]
    _, words = count_sentiment(sentences)

    # Nothing was tokenised (e.g. empty text): there is no evidence either way.
    if not words:
        return UNKNOWN

    # max() keeps the first maximal key, and the Counter iterates in insertion
    # order, so ties resolve to the label seen earliest in the text.
    return max(words, key=words.get)


def parse_list(data):
    """Classify every row of `data`.

    Each row is indexed as row[0] (kept as-is in the output) and row[1] (the
    text handed to parse_sentiment). The result is a list of
    [row[0], sentiment] pairs in the same order as the input.
    """
    return [[row[0], parse_sentiment(row[1])] for row in data]

In [0]:
# Sentiment labels returned by sentimentalize().
POSITIVE, NEGATIVE, UNKNOWN, BOTH = 'positive', 'negative', 'unknown', 'both'

# Input locations, relative to the current working directory. Despite the
# *_DIR suffix these are file paths, not directories.
POSITIVE_WORDS_DIR = os.path.join('feature_words', 'positive-words.txt')
NEGATIVE_WORDS_DIR = os.path.join('feature_words', 'negative-words.txt')
CRAWLED_DATA_DIR = 'crawled_data.csv'

# Lexicons are loaded once and kept as sets for the intersection checks in
# sentimentalize().
POSITIVE_WORDS = set(words2lists(POSITIVE_WORDS_DIR))
NEGATIVE_WORDS = set(words2lists(NEGATIVE_WORDS_DIR))

if __name__ == "__main__":
    # Load the crawled rows and classify each one. The classifier defined in
    # this notebook is parse_list; the former call to `parse_data` referred to
    # a name that does not exist here and raised NameError.
    data = csv2list(CRAWLED_DATA_DIR)
    results = parse_list(data)

    # Tally by label value. Counter matches keys with ==, whereas the earlier
    # `is 'positive'` checks compared object identity and only worked when the
    # interpreter happened to intern the strings.
    tally = Counter(result[1] for result in results)
    count_pos = tally[POSITIVE]
    count_neg = tally[NEGATIVE]
    count_unkn = tally[UNKNOWN]
    count_both = tally[BOTH]

    total = len(results)
    if total == 0:
        # Avoid dividing by zero when the input file holds no rows.
        print("no tweets found in {}".format(CRAWLED_DATA_DIR))
    else:
        print("positive tweets: {:0.3f}% ".format(count_pos/total*100))
        print("negative tweets: {:0.3f}% ".format(count_neg/total*100))
        print("unknown tweets: {:0.3f}% ".format(count_unkn/total*100))
        print("both tweets: {:0.3f}% ".format(count_both/total*100))

