In [1]:
import pandas, numpy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pandas.read_csv('../../data/dataset_old.csv')
# NLTK's English stop-word list, held as a set; later cells pass it as the
# `stop_words` argument of the word-frequency helpers.
stop_words = set(stopwords.words('english'))

In [7]:
def load_subjectivity_lexicon(header_lines=31):
    """Load the positive and negative sentiment word lists.

    Reads ``resources/sentiment_lexicon/positive-words.txt`` and
    ``resources/sentiment_lexicon/negative-words.txt`` (paths relative to the
    current working directory), treating every line as one lexicon entry.

    Parameters
    ----------
    header_lines : int, default 31
        Number of leading lines dropped from each file before filtering
        (presumably a non-word file header -- confirm against the resources).

    Returns
    -------
    tuple of (list of str, list of str)
        ``(positive_tokens, negative_tokens)`` in file order, with surrounding
        whitespace stripped, blank and ``;``-prefixed lines removed, and
        repeated entries collapsed to their first occurrence.

    Raises
    ------
    OSError
        If either lexicon file cannot be opened.
    """
    posUrl, negUrl = "resources/sentiment_lexicon/positive-words.txt", "resources/sentiment_lexicon/negative-words.txt"

    def _read_tokens(path):
        # Read one lexicon file and return its cleaned, de-duplicated entries.
        with open(path, "r") as lexicon_file:
            lines = lexicon_file.read().split("\n")[header_lines:]
        tokens = {}
        for line in lines:
            token = line.strip()
            # Splitting on "\n" yields an empty string after the final newline;
            # an empty or repeated token would corrupt a token->index vocabulary.
            # NOTE(review): ';' is assumed to mark comment lines in these files -- confirm.
            if token and not token.startswith(";"):
                tokens.setdefault(token, None)
        return list(tokens)

    return _read_tokens(posUrl), _read_tokens(negUrl)

In [8]:
# Module-level lexicon word lists; the positive/negative word-count helpers
# defined in later cells read these names as globals.
positive_tokens, negative_tokens = load_subjectivity_lexicon()

In [3]:
def most_common_words(docs, stop_words=None, n=1000):
    """Return the most frequent words across ``docs``, most frequent first.

    Parameters
    ----------
    docs : iterable of str
        Documents to tokenize and count with ``CountVectorizer``.
    stop_words : 'english', collection of str, or None
        Words to exclude from the counts. Any non-list collection (e.g. a
        set) is converted to a list before being handed to the vectorizer.
    n : int or None, default 1000
        Maximum number of distinct words to keep (the vectorizer's
        ``max_features``); ``None`` keeps every word.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``freq`` (total occurrences over all documents),
        sorted by ``freq`` in descending order.
    """
    # Newer scikit-learn releases only accept 'english', a list, or None here.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = list(stop_words)
    # `n` was previously ignored in favour of a hard-coded limit of 100.
    count_vec = CountVectorizer(max_features=n, stop_words=stop_words)
    doc_term_mat = count_vec.fit_transform(docs)
    # get_feature_names() was replaced by get_feature_names_out(); support both.
    if hasattr(count_vec, 'get_feature_names_out'):
        words = list(count_vec.get_feature_names_out())
    else:
        words = count_vec.get_feature_names()
    # Column sums of the document-term matrix = corpus-wide count per word.
    freqs = numpy.asarray(doc_term_mat.sum(axis=0)).ravel()
    word_freq = pandas.DataFrame({'word': words, 'freq': freqs}, columns=['word', 'freq'])
    return word_freq.sort_values(by='freq', ascending=False)

In [9]:
# Word frequencies over the NormalizedMessage column, excluding the NLTK
# English stop words. The column is cast to str first -- presumably so that
# missing values do not break the vectorizer (TODO confirm).
common_words = most_common_words(df.NormalizedMessage.astype(str), stop_words=stop_words)

In [14]:
def most_common_positive_words(docs, stop_words=None, n=1000):
    """Return the most frequent positive-lexicon words across ``docs``.

    Only words in the module-level ``positive_tokens`` list are counted.

    Parameters
    ----------
    docs : iterable of str
        Documents to tokenize and count with ``CountVectorizer``.
    stop_words : 'english', collection of str, or None
        Words removed during tokenization. Any non-list collection (e.g. a
        set) is converted to a list before being handed to the vectorizer.
    n : int or None, default 1000
        Maximum number of rows to return; ``None`` returns every lexicon word.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``freq`` (total occurrences over all documents),
        sorted by ``freq`` in descending order and limited to ``n`` rows.
    """
    # Newer scikit-learn releases only accept 'english', a list, or None here.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = list(stop_words)
    # Fixed vocabulary: column i of the matrix counts positive_tokens[i].
    # max_features is ignored by CountVectorizer when a vocabulary is given,
    # so it is not passed; the row limit is applied after sorting instead.
    vocabulary = {token: index for index, token in enumerate(positive_tokens)}
    count_vec = CountVectorizer(stop_words=stop_words, vocabulary=vocabulary)
    doc_term_mat = count_vec.fit_transform(docs)
    # get_feature_names() was replaced by get_feature_names_out(); support both.
    if hasattr(count_vec, 'get_feature_names_out'):
        words = list(count_vec.get_feature_names_out())
    else:
        words = count_vec.get_feature_names()
    # Column sums of the document-term matrix = corpus-wide count per word.
    freqs = numpy.asarray(doc_term_mat.sum(axis=0)).ravel()
    ranked = pandas.DataFrame({'word': words, 'freq': freqs}, columns=['word', 'freq']).sort_values(by='freq', ascending=False)
    return ranked if n is None else ranked.head(n)

def most_common_negative_words(docs, stop_words=None, n=1000):
    """Return the most frequent negative-lexicon words across ``docs``.

    Only words in the module-level ``negative_tokens`` list are counted.

    Parameters
    ----------
    docs : iterable of str
        Documents to tokenize and count with ``CountVectorizer``.
    stop_words : 'english', collection of str, or None
        Words removed during tokenization. Any non-list collection (e.g. a
        set) is converted to a list before being handed to the vectorizer.
    n : int or None, default 1000
        Maximum number of rows to return; ``None`` returns every lexicon word.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``freq`` (total occurrences over all documents),
        sorted by ``freq`` in descending order and limited to ``n`` rows.
    """
    # Newer scikit-learn releases only accept 'english', a list, or None here.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = list(stop_words)
    # Fixed vocabulary: column i of the matrix counts negative_tokens[i].
    # max_features is ignored by CountVectorizer when a vocabulary is given,
    # so it is not passed; the row limit is applied after sorting instead.
    vocabulary = {token: index for index, token in enumerate(negative_tokens)}
    count_vec = CountVectorizer(stop_words=stop_words, vocabulary=vocabulary)
    doc_term_mat = count_vec.fit_transform(docs)
    # get_feature_names() was replaced by get_feature_names_out(); support both.
    if hasattr(count_vec, 'get_feature_names_out'):
        words = list(count_vec.get_feature_names_out())
    else:
        words = count_vec.get_feature_names()
    # Column sums of the document-term matrix = corpus-wide count per word.
    freqs = numpy.asarray(doc_term_mat.sum(axis=0)).ravel()
    ranked = pandas.DataFrame({'word': words, 'freq': freqs}, columns=['word', 'freq']).sort_values(by='freq', ascending=False)
    return ranked if n is None else ranked.head(n)


In [15]:
# Corpus-wide frequency of each negative-lexicon word in NormalizedMessage
# (cast to str), with the NLTK English stop words removed during tokenization.
common_negative_words=most_common_negative_words(df.NormalizedMessage.astype(str), stop_words=stop_words)

In [19]:
# Vectorizer restricted to the negative lexicon: token -> column index,
# in the order the tokens appear in negative_tokens.
cvec = CountVectorizer(
    vocabulary={token: position for position, token in enumerate(negative_tokens)}
)
# Rows are documents, columns are negative-lexicon words.
doc_term_mat = cvec.fit_transform(df.NormalizedMessage.astype(str))

In [34]:
# Sanity check: one per-document negative-word total for every row.
len(numpy.asarray(doc_term_mat.sum(axis=1)).ravel())

243930

In [39]:
def count_neg_words_by_tweet(docs):
    """Count negative-lexicon word occurrences in each document.

    A ``CountVectorizer`` is built with a fixed vocabulary taken from the
    module-level ``negative_tokens`` list, and the resulting counts are
    summed per document.

    Returns a DataFrame with columns ``tweet`` (the input documents) and
    ``freq`` (total negative-word occurrences in that document), sorted by
    ``freq`` in descending order.
    """
    neg_vocab = {token: position for position, token in enumerate(negative_tokens)}
    vectorizer = CountVectorizer(vocabulary=neg_vocab)
    counts = vectorizer.fit_transform(docs)
    # Row sums come back as an (n_docs, 1) matrix; transpose and take the
    # first row to obtain a flat array of per-document totals.
    per_doc_totals = numpy.array(counts.sum(axis=1).T)[0]
    frame = pandas.DataFrame(
        columns=['tweet', 'freq'],
        data={'tweet': docs, 'freq': per_doc_totals},
    )
    return frame.sort_values(by='freq', ascending=False)

In [40]:
# Per-tweet negative-word totals for NormalizedMessage (cast to str),
# sorted with the highest counts first.
neg = count_neg_words_by_tweet(df.NormalizedMessage.astype(str))

In [49]:
# Tweets containing at least one negative-lexicon word.
neg.loc[neg['freq'] > 0]

Unnamed: 0,tweet,freq
186131,dear dawson kenneth burns janice kilgore ray m...,89
133629,navy federal messed up they have caused me a l...,63
133628,navy federal messed up they have caused me a l...,63
133626,navy federal messed up they have caused me a l...,63
187711,the seattle seahawks have decided as an organi...,58
...,...,...
18907,literally it seems the more i try and handle m...,1
18875,this is beyond ridiculous understand my accoun...,1
4076,about to get sued,1
42747,the common courtesy of calling her this is rid...,1


In [54]:
# Full text of one of the highest-scoring tweets, looked up by index label.
neg.loc[187711, 'tweet']

'the seattle seahawks have decided as an organization to exercise their freedom to not participate with their fellow americans in honoring our flag while the national anthem is played as a retired naval officer with over years of active duty i swore an oath to protect and defend their freedom to make that choice however being acutely aware of the sacrifice made to secure that freedom i am deeply saddened that the seahawks franchise somehow felt that ignoring disrespecting or dishonoring our national symbols flag anthem is appropriate even in heartfelt protest those of us who wear america s uniform instead of rivaling team uniforms and proudly serve her for subsistence level pay rather than multi million dollar paychecks who proudly stand salute our flag every morning as the anthem plays and every evening as taps plays who really understand the true cost of freedom find it difficult to comprehend the sad attempt to co opt and cheapen our national symbols by those who have never come clo

<b>TODO:</b> 

- Use terms like thanks, thank you, please, need, help, great, service, love, etc. to train classifiers that determine whether a tweet is a request for help (help for what?), a complaint (about what?), or a satisfied customer review (of what service or product?)

- Write classifiers to determine if tweets mention each product or service to visualize the following:
    - Of requests for help, what percent mention each product or service, as determined by the product/service topic classifier?
        - What percent got answered?
    - Of complaints, what percent are complaining about each product or service as determined by the product/service topic classifier?
        - What percent of complaints were addressed/responded to?
        - How many followers do the complaining users have?
    - Of satisfied reviews, what percent are praising each product or service as determined by the product/service topic classifier? 
        - How many followers do the satisfied reviewers have?

- Complaints:
    - Sentiment Classifier
        - Rule based sentiment classifier to label positive, negative, or neutral tweets
            - Initially explore by using raw counts of positive/negative/neutral words in each tweet
            - Explore more complex methods
                - VADER?
                - Custom sentiment score?
        - Supervised classifier to determine if a tweet is positive, negative or neutral
            - Find likely positives/negatives/neutrals based on pos/neg/neutral word counts
            - Manually label likely candidates for pos/neg/neutral
            - Train a supervised classifier on the labeled tweets to predict sentiment (positive, negative, neutral)
    - Complaint Classifier
        - Manually label some that were ruled by the sentiment classifier to be positive, some negative, some neutral
        - (Naive Bayes?) supervised classifier to label tweets as complaints
        - Ideas to identify potential complaints to label by:
            - Find tweets with high numbers of negative words
            - Find tweets with an abnormally high number of negative words relative to the rest
            - Find tweets with high ratios of negative words to positive
            - Find the most common negative words
                - Find tweets with the highest numbers of the most common negative words
                - Find tweets with abnormally high numbers of most common negative words
                - Find tweets with high ratios of negative words to most common positive
    - Complaint Clustering
        - KMeans? to group complaints with similar words
    - Topic Extraction
        - Figure out what people are complaining about by extracting topics from complaints
- Help Requests:
    - TODO: outline the approach for identifying and analyzing help requests

- Complaint Classifier
    - Manually label some that were ruled by the sentiment classifier to be positive, some negative, some neutral
    - (Naive Bayes?) supervised classifier to label tweets as complaints
    - Ideas to identify potential complaints to label by:
        - Find tweets with high numbers of negative words
        - Find tweets with an abnormally high number of negative words relative to the rest
        - Find tweets with high ratios of negative words to positive
        - Find the most common negative words
            - Find tweets with the highest numbers of the most common negative words
            - Find tweets with abnormally high numbers of most common negative words
            - Find tweets with high ratios of negative words to most common positive

- Review rating classifier
    - Train on (amazon? movie?) reviews with labels being the rating (1-5 stars?)
    - Use classifier to classify each tweet about NFCU from 1-5 stars
    - 1 stars are likely to be complaints, 5 stars are likely to be praise
        - automatically label 1 stars as complaints and 5 stars as praise
        - manually verify a number of the labeled tweets and train a classifier to predict whether a tweet is a complaint or praise

In [55]:
# NOTE(review): this import fails -- `nltk` has no top-level `imdb` name (see
# the ImportError below). NLTK's movie-review corpus is
# `nltk.corpus.movie_reviews`; confirm which review dataset was intended.
from nltk import imdb

ImportError: cannot import name 'imdb' from 'nltk' (c:\users\swein\appdata\local\programs\python\python38-32\lib\site-packages\nltk\__init__.py)