In [1]:
import csv
import time

from textblob import TextBlob
import nltk
from nltk.stem import WordNetLemmatizer 

from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/bence/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Get stopwords
stopwords_eng = stopwords.words('english')
custom_stopwords = ['\'s', '\'ll','br', '1']
# numbers = [i for i in range(10)] 
for word in custom_stopwords:
    stopwords_eng.append(word)

# stopwords_eng

In [3]:
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', 
                "N": 'n', 
                "V": 'v', 
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]    
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

In [4]:
def read_csv_data(filename):
    # Read the file line by line
    data_file = open(filename, 'r')
    lines = data_file.readlines()
    
    data = []
    with open(filename, newline='\n') as data_file:
        reader = csv.reader(data_file, delimiter=',')
        i = 0
        for row in reader:
            i += 1
            if i == 1:
                continue

            data.append(row)
#     print("Number of rows in dataset = %d" % (i - 1))
    return data

In [5]:
def build_lexicon_from_csv(filename, nr_of_rows_to_process):
    start_time = time.time()
    
    data = read_csv_data(filename)
    words_dict = {}
    
    i = 0
    for row in data:
        
        if i % 1000 == 0:
            print("Processing row %d" % i)
        
        text = TextBlob(row[0]).lower().words
        polarity = row[1]

        # filter stop words
        filtered_text = [word for word in text if not word in stopwords_eng]

        # lemmatize the text
        lemmatized_filtered_text = TextBlob(lemmatize_with_postag(" ".join(filtered_text)))
  
        for word in lemmatized_filtered_text.words:
            # word is not found in the dictionary
            if word not in words_dict:
                if polarity == 'positive':
                    words_dict[word] = {
                        "pos_count": 1,
                        "neg_count": 0
                    }
                elif polarity == 'negative':
                    words_dict[word] = {
                        "pos_count": 0,
                        "neg_count": 1
                    }
            # word is found in the dictionary
            else:
                if polarity == 'positive':
                    words_dict[word]['pos_count'] +=1
                elif polarity == 'negative':
                    words_dict[word]['neg_count'] +=1
        
        if i == nr_of_rows_to_process:
            print("Processed row %d" % i)
            break
            
        i+=1
                    
    # calculate the polarity of the words
    nr_of_words = 0
    for key, value in words_dict.items():
        neg_count = value['neg_count']
        pos_count = value['pos_count']
        words_dict[key]['polarity'] = pos_count / (neg_count + pos_count)
        
        nr_of_words += 1
        
    print("There are %d words in this sentiment lexicon." % nr_of_words)
    
    finish_time = time.time()
    exec_time = finish_time - start_time
    print("Executed in %s seconds" % exec_time)
    
    return words_dict

In [6]:
# sentiment_lexicon = build_lexicon_from_csv('IMDB_data_first100row')
sentiment_lexicon = build_lexicon_from_csv('IMDB Dataset.csv', 48000)

Processing row 0
Processing row 1000
Processing row 2000
Processing row 3000
Processing row 4000
Processing row 5000
Processing row 6000
Processing row 7000
Processing row 8000
Processing row 9000
Processing row 10000
Processing row 11000
Processing row 12000
Processing row 13000
Processing row 14000
Processing row 15000
Processing row 16000
Processing row 17000
Processing row 18000
Processing row 19000
Processing row 20000
Processing row 21000
Processing row 22000
Processing row 23000
Processing row 24000
Processing row 25000
Processing row 26000
Processing row 27000
Processing row 28000
Processing row 29000
Processing row 30000
Processing row 31000
Processing row 32000
Processing row 33000
Processing row 34000
Processing row 35000
Processing row 36000
Processing row 37000
Processing row 38000
Processing row 39000
Processing row 40000
Processing row 41000
Processing row 42000
Processing row 43000
Processing row 44000
Processing row 45000
Processing row 46000
Processing row 47000
Proce

## Polarity explainer

* 1.0  -> positive
* 0.5 -> neutral
* 0.0  -> negative

In [7]:
sentiment_lexicon

{'one': {'neg_count': 25330,
  'polarity': 0.5119836621454994,
  'pos_count': 26574},
 'reviewer': {'neg_count': 501,
  'polarity': 0.44884488448844884,
  'pos_count': 408},
 'mention': {'neg_count': 1641,
  'polarity': 0.4256212810640532,
  'pos_count': 1216},
 'watch': {'neg_count': 13538,
  'polarity': 0.46407505641106844,
  'pos_count': 11723},
 'oz': {'neg_count': 83, 'polarity': 0.6807692307692308, 'pos_count': 177},
 'episode': {'neg_count': 1613,
  'polarity': 0.63013070396698,
  'pos_count': 2748},
 'hook': {'neg_count': 187, 'polarity': 0.5862831858407079, 'pos_count': 265},
 'right': {'neg_count': 3188,
  'polarity': 0.5124636794616914,
  'pos_count': 3351},
 'exactly': {'neg_count': 1005,
  'polarity': 0.46712619300106045,
  'pos_count': 881},
 'happen': {'neg_count': 3488,
  'polarity': 0.44200927851543753,
  'pos_count': 2763},
 'first': {'neg_count': 7861,
  'polarity': 0.5240949267465795,
  'pos_count': 8657},
 'thing': {'neg_count': 9040,
  'polarity': 0.42795671707903

# Testing the generated sentiment lexicon

In [8]:
def get_single_text_polarity(input_data, input_idx, sentiment_lexicon):
    
    text = TextBlob(input_data[input_idx][0]).lower().words

    # filter stop words
    filtered_text = [word for word in text if not word in stopwords_eng]

    # lemmatize the text
    lemmatized_filtered_text = TextBlob(lemmatize_with_postag(" ".join(filtered_text)))
    
    polarity_sum = 0
    word_count = 0
    for word in lemmatized_filtered_text.words:
        if word in sentiment_lexicon:
            polarity = sentiment_lexicon[word]['polarity']
            polarity_sum += polarity
            word_count += 1

    text_polarity = polarity_sum / word_count
#     print("Text polarity: %.2f" % text_polarity)
    return text_polarity

In [10]:
def calculate_accuracy(test_data, sentiment_lexicon):
    polarity_match_count = 0
    idx = 0
    for elem in test_data:
        calculated_polarity = get_single_text_polarity(test_data, idx, sentiment_lexicon)
        polarity_str = elem[1]

        if polarity_str == 'negative':
            polarity = 0
        else:
            polarity = 1

        if calculated_polarity > 0.5:
            predicted_polarity = 1
            if predicted_polarity == polarity:
                polarity_match_count += 1
        elif calculated_polarity < 0.5:
            predicted_polarity = 0
            if predicted_polarity == polarity:
                polarity_match_count += 1

        idx += 1
        
        acc = polarity_match_count / len(test_data)
    print("Accuracy: %.2f%% \n  %d / %d = %.2f" % (acc*100, polarity_match_count, len(test_data), acc) )
    
    return acc

In [11]:
test_data = read_csv_data('IMDB_data_last_2000rows')
calculate_accuracy(test_data, sentiment_lexicon)

Accuracy: 85.00% 
  1700 / 2000 = 0.85


0.85

In [38]:
def get_text_polarity(input_data, sentiment_lexicon):
    text = TextBlob(input_data).lower().words

    # filter stop words
    filtered_text = [word for word in text if not word in stopwords_eng]

    # lemmatize the text
    lemmatized_filtered_text = TextBlob(lemmatize_with_postag(" ".join(filtered_text)))
    
    polarity_sum = 0
    word_count = 0
    for word in lemmatized_filtered_text.words:
        if word in sentiment_lexicon:
            polarity = sentiment_lexicon[word]['polarity']
            polarity_sum += polarity
            word_count += 1
            
    if word_count == 0:
        return 0.5

    text_polarity = polarity_sum / word_count
#     print("Text polarity: %.2f" % text_polarity)
    return text_polarity

In [92]:
some_texts = [
    'That was an interesting assignment, I think the generated sentiment lexicon is slightly racist and has a lot of prejudice.',
    'That was a fun assignment, I think the generated sentiment lexicon is slightly racist and has a lot of prejudice.',
    'That was a fun assignment, I think the generated lexicon is slightly racist and has a lot of prejudice.',
    'That was a fun assignment, I think the generated lexicon is slightly racist.',
    'Black',
    'White',
    'Latino', # yeah, this is the racist part (polarity is 0.2753)
    'Alright',
    'Right',
    'Perfect',
    'imperfect',
    'terrible',
    'An absolutely terrible movie.',
    'terrific',
    'A terrific performance.',
    'virus',
    'antivirus',
    'prejudice',
    'racism',
    'racist',
    'university',
    'computer',
    'science',
    'language',
    'natural language processing',
    'The perfect example.',
    'This is the perfect example, but it cannot get a perfect score.',
    'This is the perfect example, but it can\'t get a perfect score.',
    'This is the perfect example, it can get a perfect score.',
    'This is the worst example, it can\'t get a perfect score.',
    'This is the worst example, it\'s going to get a terrible score.',
    'The sky is blue',
    'The sun is shining.',
    'The sun is shining. This should get a very positive score.',
    'Human emotions are complicated.',
    'Human emotions are complicated for computers.',
    'Human emotions are complicated for humans',
    'Beauty doesn\'t rhyme well with an empty mind.',
    'Beauty does rhyme well with an empty mind.',
    'To be or not to be', # I guess every word is a stopword in this sentence
    'Comparison is the thief of self worth.',
    
    # and now try some real (Tesla V100) reviews 
    'Very nice upgrade from my Nvidia Riva TNT. Roller Coaster Tycoon ran at 3000fps at 360p.'
    'Solitaire at max settings ran at an outstanding 6000fps. Unfortunately my monitor had massive screen'
    'tearing and I misplayed as the mouse was not keeping up. The cards flickered like a candle as they'
    'floated across the screen. My CRT monitor glass turned black from the unbelievable frame rate before'
    'bursting into flames. I then upgraded my monitor to 240hz and you would not believe it, I can now see'
    'the air molecules pushing back on the cards as I move them from the Tableaux to their foundation homes.'
    'Words cannot describe the winning cascade. It was visually stunning; more so than Iguazu Falls itself.'
    'Simply tremendous! This GPU is the future of Solitaire and I will be recommending it at worlds in Helsinki. ',
    'You don\'t need a monitor.'
    'I was able to finally use those $1,800.00 HDMI Gold cables and plug them directly into my cerebral cortex. ',
    
    # and Rotten Tomatoes reviews (for Whiplash)
    'It all ends on a triumphantly upbeat note that has the intensity to completely'
    'dominate every annoyance and dubious direction that came before it.',
    
    'What makes musical genius? Is it inherent, or does it have to be dragged'
    'kicking, screaming and bleeding from you? ' # that was a 4.5/5 review but it got a 'negative' polarity (0.4846)
]

for text in some_texts:
    print("%.4f -> %s" % (get_text_polarity(text, sentiment_lexicon), text))

0.5322 -> That was an interesting assignment, I think the generated sentiment lexicon is slightly racist and has a lot of prejudice.
0.5480 -> That was a fun assignment, I think the generated sentiment lexicon is slightly racist and has a lot of prejudice.
0.5518 -> That was a fun assignment, I think the generated lexicon is slightly racist and has a lot of prejudice.
0.5424 -> That was a fun assignment, I think the generated lexicon is slightly racist.
0.5021 -> Black
0.5103 -> White
0.2754 -> Latino
0.2451 -> Alright
0.5125 -> Right
0.7717 -> Perfect
0.8000 -> imperfect
0.1270 -> terrible
0.3321 -> An absolutely terrible movie.
0.8172 -> terrific
0.7422 -> A terrific performance.
0.2675 -> virus
1.0000 -> antivirus
0.6461 -> prejudice
0.4894 -> racism
0.3588 -> racist
0.4958 -> university
0.3626 -> computer
0.4502 -> science
0.5906 -> language
0.5943 -> natural language processing
0.6056 -> The perfect example.
0.6105 -> This is the perfect example, but it cannot get a perfect score.