In [1]:
import re
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [3]:
def remove_stop(str):
    stop = set(stopwords.words('english'))
    lst = str.split(" ")
    lst = [i for i in lst if i not in stop]
    return ' '.join(lst)

In [4]:
def remove_unwanted_words(str):
    unwanted_words = ["httpaddress", "usrid", "dd", "rt", "amp", "pm", " ", "'s", "n't", "\t", '``', "''", "", "//", "\\", "\\'s", "\\?"]
    lst = str.split(" ")
    lst = [i for i in lst if i not in unwanted_words]
    return ' '.join(lst)

In [5]:
def toLower(str):
    lst = str.split(" ")
    lst = [i.lower() for i in lst]
    return ' '.join(lst)

In [6]:
def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity african_data from files, splits the african_data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load african_data from files
    positive_examples = list(open(positive_data_file, "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    positive_examples = [remove_stop(item) for item in positive_examples]
    positive_examples = [toLower(item) for item in positive_examples]
    positive_examples = [remove_unwanted_words(item) for item in positive_examples]
    
    positive_examples = [clean_str(sent) for sent in positive_examples]
    for i in positive_examples:
        if len(i.split(" ")) < 3:
            positive_examples.remove(i)
    positive_examples = list(filter(None, positive_examples) )    
    negative_examples = list(open(negative_data_file, "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    negative_examples = [toLower(item) for item in negative_examples]
    negative_examples = [remove_stop(item) for item in negative_examples]
    negative_examples = [remove_unwanted_words(item) for item in negative_examples]
    negative_examples = list(filter(None, negative_examples) )
    negative_examples = [clean_str(sent) for sent in negative_examples]
    for i in negative_examples:
        if len(i.split(" ")) < 3:
            negative_examples.remove(i)
    negative_examples = list(filter(None, negative_examples) )    
    
    # Split by words
    x_text = positive_examples + negative_examples
   # x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)

    return [x_text, y]

In [11]:
CF_pos_prccd2 = 'Data/turkish_protest_test_pos_prccd2.txt'
CF_neg_prccd2 = 'Data/turkish_protest_test_neg_prccd2.txt'

In [12]:
x_test, y = load_data_and_labels(CF_pos_prccd2, CF_neg_prccd2)

In [13]:
len(x_test)

463

In [14]:
len(y)

463

In [61]:
import random
from sklearn.utils import shuffle
from scipy.sparse import coo_matrix
import pandas as  pd

y = [[1,0],[1,0],[0,1]]
x = ['a','b','c']

df1 = pd.DataFrame({'text': x,'label': y})
df1

Unnamed: 0,label,text
0,"[1, 0]",a
1,"[1, 0]",b
2,"[0, 1]",c


In [67]:
df2 = df1.reindex(np.random.permutation(df1.index))

In [68]:
df2[]

Unnamed: 0,label,text
1,"[1, 0]",b
2,"[0, 1]",c
0,"[1, 0]",a


In [74]:
l = list(df2(0))

TypeError: 'DataFrame' object is not callable

In [70]:
l

[[1, 0], [0, 1], [1, 0]]

In [71]:
t = list(df2.text)
t

['b', 'c', 'a']

In [72]:
df1.index

RangeIndex(start=0, stop=3, step=1)

In [76]:
import pandas as pd

df1 = pd.DataFrame({
    'A': [1,2,3,4,5],
    'B': [1,2,3,4,5]
})

df2 = pd.DataFrame({
    'C': [1,2,3,4,5],
    'D': [1,2,3,4,5]
})

df_concat = pd.concat([df1, df2], axis=1)

print(df_concat)


   A  B  C  D
0  1  1  1  1
1  2  2  2  2
2  3  3  3  3
3  4  4  4  4
4  5  5  5  5


In [178]:
l1 = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n"]
l2 = [[1,0], [2,0], [0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14]]
l3 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]
l = [l1,l2],[]

#random.Random(2).shuffle(l1)
#print (l1)
#random.Random(2).shuffle(l2)
#print (l2)

In [179]:
random.sample(l1, 5)

['a', 'c', 'd', 'h', 'e']

In [180]:
random.sample(l2, 5)

[[0, 13], [0, 9], [0, 12], [0, 11], [0, 6]]

In [217]:
shuffle_indices = list(np.random.permutation(np.arange(300)))

In [218]:
shuffle_indices

[223,
 3,
 125,
 162,
 56,
 20,
 11,
 173,
 198,
 119,
 172,
 259,
 12,
 101,
 122,
 194,
 17,
 135,
 99,
 151,
 262,
 287,
 297,
 290,
 261,
 249,
 189,
 118,
 8,
 248,
 211,
 21,
 175,
 80,
 270,
 216,
 110,
 82,
 177,
 191,
 218,
 66,
 225,
 160,
 6,
 281,
 176,
 253,
 70,
 134,
 104,
 35,
 251,
 260,
 105,
 214,
 124,
 184,
 215,
 235,
 147,
 139,
 197,
 81,
 14,
 103,
 252,
 296,
 52,
 165,
 13,
 92,
 159,
 24,
 34,
 121,
 90,
 116,
 89,
 166,
 163,
 78,
 178,
 45,
 240,
 258,
 171,
 58,
 128,
 286,
 233,
 150,
 182,
 201,
 278,
 33,
 196,
 85,
 84,
 228,
 192,
 275,
 231,
 132,
 291,
 126,
 221,
 169,
 284,
 263,
 145,
 48,
 186,
 149,
 208,
 91,
 288,
 268,
 96,
 207,
 185,
 264,
 273,
 133,
 86,
 107,
 170,
 190,
 64,
 37,
 283,
 106,
 298,
 9,
 202,
 213,
 32,
 129,
 241,
 229,
 50,
 203,
 76,
 267,
 230,
 113,
 217,
 5,
 36,
 39,
 115,
 242,
 100,
 79,
 15,
 95,
 1,
 138,
 57,
 51,
 294,
 68,
 222,
 236,
 2,
 73,
 22,
 72,
 46,
 161,
 250,
 204,
 247,
 205,
 61,
 23,
 155,
 2

In [183]:
x = (len(shuffle_indices) * 20 // 100)
print(x)

2


In [184]:
shuffle_indices = random.sample(shuffle_indices,x)

In [185]:
shuffle_indices

[11, 5]

In [186]:
l1 = l1[shuffle_indices]

TypeError: list indices must be integers or slices, not list

In [187]:
l1 = [l1[i] for i in shuffle_indices]
l2 = [l2[i] for i in shuffle_indices]

In [188]:
l1

['l', 'f']

In [189]:
l2

[[0, 12], [0, 6]]

In [193]:
def mean(numbers):
    return float(sum(numbers)) / max(len(numbers), 1)

In [195]:
n = [1,2,3,5,8,10,3]
mean(n)

4.571428571428571