## Oppgave 1:

In [95]:
import nltk
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import re
import string
import random

In [25]:
download_list = ["twitter_samples", "punkt", "wordnet", "averaged_perceptron_tagger", "stopwords"]

for item in download_list:
    if not nltk.download(item, quiet=True):
        print(f"Download of {item} failed!")

In [28]:
stop_words = stopwords.words('english')

In [93]:
print(type(twitter_samples), twitter_samples.__doc__)
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

<class 'nltk.corpus.reader.twitter.TwitterCorpusReader'> 
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

       root = os.environ['TWITTER']
       reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can


In [16]:
print(tweet_tokens[0], positive_tweets[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)'] #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


In [19]:
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [32]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

print(" ".join(remove_noise(tweet_tokens[0], stop_words=stop_words)))
print(" ".join(remove_noise(tweet_tokens[0])))

#followfriday top engage member community week :)
#followfriday for be top engage member in my community this week :)


In [35]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = [remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens]
negative_cleaned_tokens_list = [remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens]

In [36]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


In [39]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [45]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [47]:
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [50]:
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9946666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2085.9 : 1.0
                      :) = True           Positi : Negati =    970.4 : 1.0
                follower = True           Positi : Negati =     23.5 : 1.0
                  arrive = True           Positi : Negati =     20.0 : 1.0
                     sad = True           Negati : Positi =     19.1 : 1.0
                     x15 = True           Negati : Positi =     16.1 : 1.0
               community = True           Positi : Negati =     13.9 : 1.0
                 welcome = True           Positi : Negati =     12.2 : 1.0
                    sick = True           Negati : Positi =     12.2 : 1.0
                   didnt = True           Negati : Positi =     12.0 : 1.0
None


In [76]:
custom_tweet = "I ordered just once from TerribleCo, they were not bad, never used the app again."

custom_tokens = remove_noise(word_tokenize(custom_tweet))
print(custom_tokens)

print(classifier.classify(dict([token, True] for token in custom_tokens)))

['i', 'order', 'just', 'once', 'from', 'terribleco', 'they', 'be', 'not', 'bad', 'never', 'use', 'the', 'app', 'again']
Negative


Ting å se på:
Forstå naive bayes, og hvorfor "not happy" blir positivt. Teller den rekkefølge? Er alle ordene uavhengige av hverandre?  
Andre datasett, med andre sentiment enn positiv og negativ  
Inkludere tilfeller der teksten ikke er verken positiv eller negativ

## Utforsking av datasett med kjønn og tweets
https://www.kaggle.com/crowdflower/twitter-user-gender-classification  
Idé: Finne språklige mønster som går igjen hos menn og kvinner, og så se om disse går igjen i det forrige datasettet

In [78]:
import pandas as pd

In [84]:
gender_df = pd.read_csv("../data/gender-classifier-DFE-791531.csv", encoding = "latin1")
gender_df = gender_df[gender_df["gender:confidence"] >= 0.9][["text", "gender"]]
display(gender_df)

Unnamed: 0,text,gender
0,Robbie E Responds To Critics After Win Against...,male
1,ÛÏIt felt like they were my friends and I was...,male
3,Hi @JordanSpieth - Looking at the url - do you...,male
4,Watching Neighbours on Sky+ catching up with t...,female
5,"Ive seen people on the train with lamps, chair...",female
...,...,...
20044,Need A Ride Home From Practice _Ù÷Ô_Ù÷Ô_Ù÷ÔAnd...,female
20045,"@lookupondeath ...Fine, and I'll drink tea too...",female
20046,Greg Hardy you a good player and all but don't...,male
20047,You can miss people and still never want to se...,male


In [None]:
def remove_noise2(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [90]:
gender_tokens = gender_df.apply(lambda row: word_tokenize(row["text"]), axis=1)
gender_cleaned_tokens = [remove_noise(tokens, stop_words) for tokens in gender_tokens]
all_words = get_all_words(gender_cleaned_tokens)
freq_dist_pos = FreqDist(all_words)
print(freq_dist_pos.most_common(100))

[('http', 6088), ('get', 1716), ("'s", 1654), ('...', 1334), ("n't", 1210), ('weather', 1138), ('``', 796), ('go', 767), ("''", 765), ('like', 747), ("'m", 710), ('make', 622), ('one', 601), ('channel', 592), ('updates', 560), ('day', 534), ('love', 520), ('time', 497), ('new', 494), ('good', 454), ('see', 451), ('amp', 426), ('know', 418), ('people', 406), ('look', 391), ('say', 391), ('want', 373), ('best', 359), ('come', 359), ('think', 350), ('need', 332), ('take', 327), ('back', 327), ("'re", 321), ('work', 309), ('year', 304), ('last', 293), ('..', 293), ('thing', 276), ('great', 270), ("'ve", 265), ('would', 263), ('still', 262), ('today', 251), ('us', 249), ('life', 244), ('way', 243), ('watch', 240), ('via', 240), ('2', 239), ('week', 231), ("'ll", 230), ('follow', 230), ('ca', 229), ('try', 227), ('find', 220), ('game', 220), ('world', 219), ('could', 217), ('u', 214), ('really', 214), ('na', 214), ('right', 213), ('let', 212), ('give', 207), ('lol', 207), ('even', 205), ('fi

In [96]:
word_tokenize.__doc__

"\n    Return a tokenized copy of *text*,\n    using NLTK's recommended word tokenizer\n    (currently an improved :class:`.TreebankWordTokenizer`\n    along with :class:`.PunktSentenceTokenizer`\n    for the specified language).\n\n    :param text: text to split into words\n    :type text: str\n    :param language: the model name in the Punkt corpus\n    :type language: str\n    :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.\n    :type preserve_line: bool\n    "