# Import libraries

In [1]:
import pandas as pd
import numpy as np
import random

# Set random seeds

In [2]:
def set_random_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)

set_random_seeds(42)

# Constants

In [3]:
REMOVE_RETWEETS = True
REMOVE_TWEETS_WITH_POS_AND_NEG_EMOTICONS = True
REMOVE_DUPLICATE_TWEETS = True
BASIC_COLUMN_HEADERS = ['polarity', 'tweet_id',
                        'date', 'query', 'user', 'content']

TRAINDATA_PATH = 'data/training.1600000.processed.noemoticon_utf8.csv'


# Load data

In [4]:
data = pd.read_csv(TRAINDATA_PATH, header=None)
data.columns = BASIC_COLUMN_HEADERS

# Remove Tweets

## Remove Retweets

In [5]:
def removeRetweets(data):
    for i, content in enumerate(data['content']):
        if content[0:2].lower() == 'rt':
            data = data.drop(index=i)
    return data


if (REMOVE_RETWEETS):
    data = removeRetweets(data)
    print('Datensatzgröße nach Entfernen: %d' % len(data))

Datensatzgröße nach Entfernen: 1599840


## Remove tweets with negative and positive emoticons, also remove tweets with ':P' (following Alec Go et al.) 

In [6]:
def removeTweetsWithPosAndNegEmoticons(data):
    for i, content in enumerate(data['content']):
        if (':(' in content and ':)' in content) or ('(:' in content and '):' in content) or (':P' in content):
            data = data.drop(index=i)
    return data


if (REMOVE_TWEETS_WITH_POS_AND_NEG_EMOTICONS):
    data = removeTweetsWithPosAndNegEmoticons(data)
    print('Datensatzgröße nach Entfernen: %d' % len(data))

Datensatzgröße nach Entfernen: 1599838


## Remove duplicates

### by ID

In [7]:
if REMOVE_DUPLICATE_TWEETS:
    df_value_counts = data['tweet_id'].value_counts().to_frame().reset_index()
    df_value_counts.columns = ['unique_values', 'count']

    #--REMOVE DUPLICATE TWEETS BY ID

    print("Es gibt 1.685 doppelte Tweets (tweet_id):")
    print(df_value_counts['count'].value_counts())

    print("Es gibt 18.532 doppelte Tweets (content):")
    print(data[data.duplicated(['content'])].shape)

    #example of same content tweet
    print(data.loc[data['content'] ==
                   'getting used to twitter ', ['user', 'content']])

    duplicates = data.loc[data['tweet_id'] == 2190457769]

    #Problem: zwei gleiche tweets werden unterschiedlich gelabeled
    print(duplicates[['polarity', 'content']])

    #find all duplicates that have the above problem
    duplicatesById = data[data.duplicated(['tweet_id'])]
    dataGroupedById = data.loc[:, ['polarity', 'tweet_id', 'content']].groupby([
        'tweet_id'])

    #count all duplicates (tweet_id) having a different polarity
    count = 0
    for tweet_id in duplicatesById['tweet_id']:
        group = dataGroupedById.get_group(tweet_id)
        if group['polarity'].agg(np.mean) != 0 and group['polarity'].agg(np.mean) != 4:
            count += 1

    print('%d von %d Duplikaten haben unterschiediche Polarität.' %
          (count, len(duplicatesById)))

    print('Aktuelle Größe des Datensatzes: %d' % (len(data)))
    print('Lösche alle %d Duplikate...' % (count * 2))
    #delete them
    data = data[~data['tweet_id'].isin(duplicatesById['tweet_id'])]
    print('Aktuelle Größe des Datensatzes: %d' % (len(data)))


Es gibt 1.685 doppelte Tweets (tweet_id):
1    1596468
2       1685
Name: count, dtype: int64
Es gibt 18.532 doppelte Tweets (content):
(18532, 6)
                    user                   content
974898   djtwistedvision  getting used to twitter 
1536379         Kimandra  getting used to twitter 
1599501          _cammi_  getting used to twitter 
         polarity                                            content
513734          0  @berntina I know I thought it was great.. 3 te...
1583635         4  @berntina I know I thought it was great.. 3 te...
1685 von 1685 Duplikaten haben unterschiediche Polarität.
Aktuelle Größe des Datensatzes: 1599838
Lösche alle 3370 Duplikate...
Aktuelle Größe des Datensatzes: 1596468


## by content

In [8]:
def most_frequent(List):
    counter = 0
    num = List[0]

    for i in List:
        curr_frequency = List.count(i)
        if (curr_frequency > counter):
            counter = curr_frequency
            num = i

    return num


if REMOVE_DUPLICATE_TWEETS:
    #REMOVE DUPLICATES BY CONTENT
    #after deleting duplicates by id there are 23596 duplicates by content
    #all tweets whose content appear more than once in the dataset
    duplicatesByContent = data[data.duplicated(['content'], keep=False)]
    duplicatesByContentGrouped = duplicatesByContent.groupby(
        ['content'])  # grouped by their content

    #check for different polarity
    counter = 0
    one_sample_per_group_list = pd.DataFrame(columns=data.columns)

    for group_name, group in duplicatesByContentGrouped:
        agg_polarity = group['polarity'].agg(np.mean)
        modus_polarity = most_frequent(group['polarity'].tolist())
        if agg_polarity != 0 and agg_polarity != 4:
            counter += 1
        keeped_element = group.head(1)
        keeped_element['polarity'] = modus_polarity
        one_sample_per_group_list = one_sample_per_group_list.append(
            keeped_element)

    print('Anzahl Duplikate insgesamt: %d' % len(duplicatesByContent))
    print('anzahl gruppen: %d' % len(duplicatesByContentGrouped.groups))
    print('Anzahl Gruppen mit interner unterschiedlicher Polarität: %d' % (counter))
    print('Aktuelle Größe des Datensatzes: %d' % (len(data)))

    print('Lösche alle Duplikate...')
    #delete all duplicates by content
    data = data.drop_duplicates(subset=['content'], keep=False)
    print('Aktuelle Größe des Datensatzes: %d' % (len(data)))
    #add one representative sample for each group again
    print('Einen Vertreter jeder Gruppe wieder hinzufügen...')
    data = data.append(one_sample_per_group_list)
    print('Aktuelle Größe des Datensatzes: %d' % (len(data)))

  for group_name, group in duplicatesByContentGrouped:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keeped_element['polarity'] = modus_polarity
  one_sample_per_group_list = one_sample_per_group_list.append(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keeped_element['polarity'] = modus_polarity
  one_sample_per_group_list = one_sample_per_group_list.append(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

Anzahl Duplikate insgesamt: 23592
anzahl gruppen: 6758
Anzahl Gruppen mit interner unterschiedlicher Polarität: 547
Aktuelle Größe des Datensatzes: 1596468
Lösche alle Duplikate...
Aktuelle Größe des Datensatzes: 1572876
Einen Vertreter jeder Gruppe wieder hinzufügen...
Aktuelle Größe des Datensatzes: 1579634


  data = data.append(one_sample_per_group_list)


# Drop unimportant columns

In [9]:
data = data.drop(['tweet_id', 'date', 'query', 'user'], axis=1)
data['polarity'] = data['polarity'].astype('int64')

# Save to disk

In [10]:
data.to_csv("data/train_tweets_removed.csv", index=False, encoding='utf-8')
print("Datensatz der Größe %d gespeichert." % len(data))

Datensatz der Größe 1579634 gespeichert.


In [11]:
data = pd.read_csv("data/train_tweets_removed.csv")

In [12]:
data[101:103]


Unnamed: 0,polarity,content
101,0,Behind on my classes for work
102,0,watching &quot;House&quot;
