## Text Preprocessing

This notebook is to process and clean 1000 lines of blog data.

***

In [None]:
# Initialize NLTK
# import nltk
# nltk.download('popular')

In [None]:
#nltk.__version__

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import matplotlib.pyplot as plt
import wordcloud
from wordcloud import WordCloud, ImageColorGenerator
from collections import OrderedDict
from ast import literal_eval

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
from nltk.probability import FreqDist
from nltk.util import bigrams, ngrams, trigrams

pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
%matplotlib inline

In [2]:
df = pd.read_csv('blogdata_1000.csv')

In [3]:
df.head()

Unnamed: 0,words
0,"We have given our hearts away, a sordid boon !”"
1,1. Start it on the side
2,"Sugar’s sweet, so is she,"
3,"So because the Asian community was so by need tight-knit, the next thing you know I have Asian kids following me around, you know doing more shit for me than the boys that wanted inside of me. Especially the two in particular. They thought I was “so wonderful” which sickened me even further. I was not wonderful, it reminds me of that quote, from WWII era forget it but goes on to say something like, “It was not that I was a hero, it was everyone around me was acting so badly” when someone won a humanitarian award."
4,Nicholas’ brain runs on and is excellent at processing data and concrete facts. He works best with things he can quantify. He runs into an almost insurmountable challenge in trying to rationalize the existence of his exceptional infant daughter and what she represents. He isn’t without a sense of humor and is very loving but he is forced to radically confront his limitations.


In [4]:
df.shape

(1000, 1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   words   1000 non-null   object
dtypes: object(1)
memory usage: 7.9+ KB


In [6]:
df.describe()

Unnamed: 0,words
count,1000
unique,1000
top,2.Send a kind wobbly fish thought to someone you know who could do with it.
freq,1


In [7]:
len(df.words)

1000

### Tokenization

In [8]:
df['tokenized'] = df['words'].apply(word_tokenize)

In [9]:
df.head()

Unnamed: 0,words,tokenized
0,"We have given our hearts away, a sordid boon !”","[We, have, given, our, hearts, away, ,, a, sordid, boon, !, ”]"
1,1. Start it on the side,"[1, ., Start, it, on, the, side]"
2,"Sugar’s sweet, so is she,","[Sugar, ’, s, sweet, ,, so, is, she, ,]"
3,"So because the Asian community was so by need tight-knit, the next thing you know I have Asian kids following me around, you know doing more shit for me than the boys that wanted inside of me. Especially the two in particular. They thought I was “so wonderful” which sickened me even further. I was not wonderful, it reminds me of that quote, from WWII era forget it but goes on to say something like, “It was not that I was a hero, it was everyone around me was acting so badly” when someone won a humanitarian award.","[So, because, the, Asian, community, was, so, by, need, tight-knit, ,, the, next, thing, you, know, I, have, Asian, kids, following, me, around, ,, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, ., Especially, the, two, in, particular, ., They, thought, I, was, “, so, wonderful, ”, which, sickened, me, even, further, ., I, was, not, wonderful, ,, it, reminds, me, of, that, quote, ,, from, WWII, era, forget, it, but, goes, on, to, say, something, like, ,, “, It, was, not, that, I, was, a, hero, ,, it, was, everyone, around, me, ...]"
4,Nicholas’ brain runs on and is excellent at processing data and concrete facts. He works best with things he can quantify. He runs into an almost insurmountable challenge in trying to rationalize the existence of his exceptional infant daughter and what she represents. He isn’t without a sense of humor and is very loving but he is forced to radically confront his limitations.,"[Nicholas, ’, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, ., He, works, best, with, things, he, can, quantify, ., He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, ., He, isn, ’, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations, .]"


### Removing Punctuations

In [10]:
punc = string.punctuation

In [11]:
df['no_punc'] = df['tokenized'].apply(lambda x: [word for word in x if word not in punc])

In [12]:
df.head()

Unnamed: 0,words,tokenized,no_punc
0,"We have given our hearts away, a sordid boon !”","[We, have, given, our, hearts, away, ,, a, sordid, boon, !, ”]","[We, have, given, our, hearts, away, a, sordid, boon, ”]"
1,1. Start it on the side,"[1, ., Start, it, on, the, side]","[1, Start, it, on, the, side]"
2,"Sugar’s sweet, so is she,","[Sugar, ’, s, sweet, ,, so, is, she, ,]","[Sugar, ’, s, sweet, so, is, she]"
3,"So because the Asian community was so by need tight-knit, the next thing you know I have Asian kids following me around, you know doing more shit for me than the boys that wanted inside of me. Especially the two in particular. They thought I was “so wonderful” which sickened me even further. I was not wonderful, it reminds me of that quote, from WWII era forget it but goes on to say something like, “It was not that I was a hero, it was everyone around me was acting so badly” when someone won a humanitarian award.","[So, because, the, Asian, community, was, so, by, need, tight-knit, ,, the, next, thing, you, know, I, have, Asian, kids, following, me, around, ,, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, ., Especially, the, two, in, particular, ., They, thought, I, was, “, so, wonderful, ”, which, sickened, me, even, further, ., I, was, not, wonderful, ,, it, reminds, me, of, that, quote, ,, from, WWII, era, forget, it, but, goes, on, to, say, something, like, ,, “, It, was, not, that, I, was, a, hero, ,, it, was, everyone, around, me, ...]","[So, because, the, Asian, community, was, so, by, need, tight-knit, the, next, thing, you, know, I, have, Asian, kids, following, me, around, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, Especially, the, two, in, particular, They, thought, I, was, “, so, wonderful, ”, which, sickened, me, even, further, I, was, not, wonderful, it, reminds, me, of, that, quote, from, WWII, era, forget, it, but, goes, on, to, say, something, like, “, It, was, not, that, I, was, a, hero, it, was, everyone, around, me, was, acting, so, badly, ”, when, someone, won, a, ...]"
4,Nicholas’ brain runs on and is excellent at processing data and concrete facts. He works best with things he can quantify. He runs into an almost insurmountable challenge in trying to rationalize the existence of his exceptional infant daughter and what she represents. He isn’t without a sense of humor and is very loving but he is forced to radically confront his limitations.,"[Nicholas, ’, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, ., He, works, best, with, things, he, can, quantify, ., He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, ., He, isn, ’, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations, .]","[Nicholas, ’, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, He, works, best, with, things, he, can, quantify, He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, He, isn, ’, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations]"


### Removing numbers and remaining punctuation marks

In [13]:
df['no_numbers'] = df['no_punc'].apply(lambda x: [word for word in x if word.isalpha()])

In [14]:
df.head()

Unnamed: 0,words,tokenized,no_punc,no_numbers
0,"We have given our hearts away, a sordid boon !”","[We, have, given, our, hearts, away, ,, a, sordid, boon, !, ”]","[We, have, given, our, hearts, away, a, sordid, boon, ”]","[We, have, given, our, hearts, away, a, sordid, boon]"
1,1. Start it on the side,"[1, ., Start, it, on, the, side]","[1, Start, it, on, the, side]","[Start, it, on, the, side]"
2,"Sugar’s sweet, so is she,","[Sugar, ’, s, sweet, ,, so, is, she, ,]","[Sugar, ’, s, sweet, so, is, she]","[Sugar, s, sweet, so, is, she]"
3,"So because the Asian community was so by need tight-knit, the next thing you know I have Asian kids following me around, you know doing more shit for me than the boys that wanted inside of me. Especially the two in particular. They thought I was “so wonderful” which sickened me even further. I was not wonderful, it reminds me of that quote, from WWII era forget it but goes on to say something like, “It was not that I was a hero, it was everyone around me was acting so badly” when someone won a humanitarian award.","[So, because, the, Asian, community, was, so, by, need, tight-knit, ,, the, next, thing, you, know, I, have, Asian, kids, following, me, around, ,, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, ., Especially, the, two, in, particular, ., They, thought, I, was, “, so, wonderful, ”, which, sickened, me, even, further, ., I, was, not, wonderful, ,, it, reminds, me, of, that, quote, ,, from, WWII, era, forget, it, but, goes, on, to, say, something, like, ,, “, It, was, not, that, I, was, a, hero, ,, it, was, everyone, around, me, ...]","[So, because, the, Asian, community, was, so, by, need, tight-knit, the, next, thing, you, know, I, have, Asian, kids, following, me, around, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, Especially, the, two, in, particular, They, thought, I, was, “, so, wonderful, ”, which, sickened, me, even, further, I, was, not, wonderful, it, reminds, me, of, that, quote, from, WWII, era, forget, it, but, goes, on, to, say, something, like, “, It, was, not, that, I, was, a, hero, it, was, everyone, around, me, was, acting, so, badly, ”, when, someone, won, a, ...]","[So, because, the, Asian, community, was, so, by, need, the, next, thing, you, know, I, have, Asian, kids, following, me, around, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, Especially, the, two, in, particular, They, thought, I, was, so, wonderful, which, sickened, me, even, further, I, was, not, wonderful, it, reminds, me, of, that, quote, from, WWII, era, forget, it, but, goes, on, to, say, something, like, It, was, not, that, I, was, a, hero, it, was, everyone, around, me, was, acting, so, badly, when, someone, won, a, humanitarian, award]"
4,Nicholas’ brain runs on and is excellent at processing data and concrete facts. He works best with things he can quantify. He runs into an almost insurmountable challenge in trying to rationalize the existence of his exceptional infant daughter and what she represents. He isn’t without a sense of humor and is very loving but he is forced to radically confront his limitations.,"[Nicholas, ’, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, ., He, works, best, with, things, he, can, quantify, ., He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, ., He, isn, ’, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations, .]","[Nicholas, ’, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, He, works, best, with, things, he, can, quantify, He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, He, isn, ’, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations]","[Nicholas, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, He, works, best, with, things, he, can, quantify, He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, He, isn, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations]"


### Lower Case Letters Conversion

In [15]:
df['final'] = df['no_numbers'].apply(lambda x: [word.lower() for word in x])

In [16]:
df.head()

Unnamed: 0,words,tokenized,no_punc,no_numbers,final
0,"We have given our hearts away, a sordid boon !”","[We, have, given, our, hearts, away, ,, a, sordid, boon, !, ”]","[We, have, given, our, hearts, away, a, sordid, boon, ”]","[We, have, given, our, hearts, away, a, sordid, boon]","[we, have, given, our, hearts, away, a, sordid, boon]"
1,1. Start it on the side,"[1, ., Start, it, on, the, side]","[1, Start, it, on, the, side]","[Start, it, on, the, side]","[start, it, on, the, side]"
2,"Sugar’s sweet, so is she,","[Sugar, ’, s, sweet, ,, so, is, she, ,]","[Sugar, ’, s, sweet, so, is, she]","[Sugar, s, sweet, so, is, she]","[sugar, s, sweet, so, is, she]"
3,"So because the Asian community was so by need tight-knit, the next thing you know I have Asian kids following me around, you know doing more shit for me than the boys that wanted inside of me. Especially the two in particular. They thought I was “so wonderful” which sickened me even further. I was not wonderful, it reminds me of that quote, from WWII era forget it but goes on to say something like, “It was not that I was a hero, it was everyone around me was acting so badly” when someone won a humanitarian award.","[So, because, the, Asian, community, was, so, by, need, tight-knit, ,, the, next, thing, you, know, I, have, Asian, kids, following, me, around, ,, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, ., Especially, the, two, in, particular, ., They, thought, I, was, “, so, wonderful, ”, which, sickened, me, even, further, ., I, was, not, wonderful, ,, it, reminds, me, of, that, quote, ,, from, WWII, era, forget, it, but, goes, on, to, say, something, like, ,, “, It, was, not, that, I, was, a, hero, ,, it, was, everyone, around, me, ...]","[So, because, the, Asian, community, was, so, by, need, tight-knit, the, next, thing, you, know, I, have, Asian, kids, following, me, around, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, Especially, the, two, in, particular, They, thought, I, was, “, so, wonderful, ”, which, sickened, me, even, further, I, was, not, wonderful, it, reminds, me, of, that, quote, from, WWII, era, forget, it, but, goes, on, to, say, something, like, “, It, was, not, that, I, was, a, hero, it, was, everyone, around, me, was, acting, so, badly, ”, when, someone, won, a, ...]","[So, because, the, Asian, community, was, so, by, need, the, next, thing, you, know, I, have, Asian, kids, following, me, around, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, Especially, the, two, in, particular, They, thought, I, was, so, wonderful, which, sickened, me, even, further, I, was, not, wonderful, it, reminds, me, of, that, quote, from, WWII, era, forget, it, but, goes, on, to, say, something, like, It, was, not, that, I, was, a, hero, it, was, everyone, around, me, was, acting, so, badly, when, someone, won, a, humanitarian, award]","[so, because, the, asian, community, was, so, by, need, the, next, thing, you, know, i, have, asian, kids, following, me, around, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, especially, the, two, in, particular, they, thought, i, was, so, wonderful, which, sickened, me, even, further, i, was, not, wonderful, it, reminds, me, of, that, quote, from, wwii, era, forget, it, but, goes, on, to, say, something, like, it, was, not, that, i, was, a, hero, it, was, everyone, around, me, was, acting, so, badly, when, someone, won, a, humanitarian, award]"
4,Nicholas’ brain runs on and is excellent at processing data and concrete facts. He works best with things he can quantify. He runs into an almost insurmountable challenge in trying to rationalize the existence of his exceptional infant daughter and what she represents. He isn’t without a sense of humor and is very loving but he is forced to radically confront his limitations.,"[Nicholas, ’, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, ., He, works, best, with, things, he, can, quantify, ., He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, ., He, isn, ’, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations, .]","[Nicholas, ’, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, He, works, best, with, things, he, can, quantify, He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, He, isn, ’, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations]","[Nicholas, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, He, works, best, with, things, he, can, quantify, He, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, He, isn, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations]","[nicholas, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, he, works, best, with, things, he, can, quantify, he, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, he, isn, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations]"


### Drop columns and save to csv

In [17]:
df.columns

Index(['words', 'tokenized', 'no_punc', 'no_numbers', 'final'], dtype='object')

In [18]:
df.drop(['words', 'tokenized', 'no_punc', 'no_numbers'], axis=1, inplace=True)

In [19]:
df.head()

Unnamed: 0,final
0,"[we, have, given, our, hearts, away, a, sordid, boon]"
1,"[start, it, on, the, side]"
2,"[sugar, s, sweet, so, is, she]"
3,"[so, because, the, asian, community, was, so, by, need, the, next, thing, you, know, i, have, asian, kids, following, me, around, you, know, doing, more, shit, for, me, than, the, boys, that, wanted, inside, of, me, especially, the, two, in, particular, they, thought, i, was, so, wonderful, which, sickened, me, even, further, i, was, not, wonderful, it, reminds, me, of, that, quote, from, wwii, era, forget, it, but, goes, on, to, say, something, like, it, was, not, that, i, was, a, hero, it, was, everyone, around, me, was, acting, so, badly, when, someone, won, a, humanitarian, award]"
4,"[nicholas, brain, runs, on, and, is, excellent, at, processing, data, and, concrete, facts, he, works, best, with, things, he, can, quantify, he, runs, into, an, almost, insurmountable, challenge, in, trying, to, rationalize, the, existence, of, his, exceptional, infant, daughter, and, what, she, represents, he, isn, t, without, a, sense, of, humor, and, is, very, loving, but, he, is, forced, to, radically, confront, his, limitations]"


In [20]:
#df.to_csv("blogclean1000.csv",index=False)

### Exploratory Data Analysis and Visualization

In [21]:
df = pd.read_csv("blogclean1000.csv")

In [22]:
df.head()

Unnamed: 0,final
0,"['we', 'have', 'given', 'our', 'hearts', 'away', 'a', 'sordid', 'boon']"
1,"['start', 'it', 'on', 'the', 'side']"
2,"['sugar', 's', 'sweet', 'so', 'is', 'she']"
3,"['so', 'because', 'the', 'asian', 'community', 'was', 'so', 'by', 'need', 'the', 'next', 'thing', 'you', 'know', 'i', 'have', 'asian', 'kids', 'following', 'me', 'around', 'you', 'know', 'doing', 'more', 'shit', 'for', 'me', 'than', 'the', 'boys', 'that', 'wanted', 'inside', 'of', 'me', 'especially', 'the', 'two', 'in', 'particular', 'they', 'thought', 'i', 'was', 'so', 'wonderful', 'which', 'sickened', 'me', 'even', 'further', 'i', 'was', 'not', 'wonderful', 'it', 'reminds', 'me', 'of', 'that', 'quote', 'from', 'wwii', 'era', 'forget', 'it', 'but', 'goes', 'on', 'to', 'say', 'something', 'like', 'it', 'was', 'not', 'that', 'i', 'was', 'a', 'hero', 'it', 'was', 'everyone', 'around', 'me', 'was', 'acting', 'so', 'badly', 'when', 'someone', 'won', 'a', 'humanitarian', 'award']"
4,"['nicholas', 'brain', 'runs', 'on', 'and', 'is', 'excellent', 'at', 'processing', 'data', 'and', 'concrete', 'facts', 'he', 'works', 'best', 'with', 'things', 'he', 'can', 'quantify', 'he', 'runs', 'into', 'an', 'almost', 'insurmountable', 'challenge', 'in', 'trying', 'to', 'rationalize', 'the', 'existence', 'of', 'his', 'exceptional', 'infant', 'daughter', 'and', 'what', 'she', 'represents', 'he', 'isn', 't', 'without', 'a', 'sense', 'of', 'humor', 'and', 'is', 'very', 'loving', 'but', 'he', 'is', 'forced', 'to', 'radically', 'confront', 'his', 'limitations']"


In [23]:
df['word_count'] = df['final'].apply(lambda x: len(str(x).split()))

In [24]:
df.head()

Unnamed: 0,final,word_count
0,"['we', 'have', 'given', 'our', 'hearts', 'away', 'a', 'sordid', 'boon']",9
1,"['start', 'it', 'on', 'the', 'side']",5
2,"['sugar', 's', 'sweet', 'so', 'is', 'she']",6
3,"['so', 'because', 'the', 'asian', 'community', 'was', 'so', 'by', 'need', 'the', 'next', 'thing', 'you', 'know', 'i', 'have', 'asian', 'kids', 'following', 'me', 'around', 'you', 'know', 'doing', 'more', 'shit', 'for', 'me', 'than', 'the', 'boys', 'that', 'wanted', 'inside', 'of', 'me', 'especially', 'the', 'two', 'in', 'particular', 'they', 'thought', 'i', 'was', 'so', 'wonderful', 'which', 'sickened', 'me', 'even', 'further', 'i', 'was', 'not', 'wonderful', 'it', 'reminds', 'me', 'of', 'that', 'quote', 'from', 'wwii', 'era', 'forget', 'it', 'but', 'goes', 'on', 'to', 'say', 'something', 'like', 'it', 'was', 'not', 'that', 'i', 'was', 'a', 'hero', 'it', 'was', 'everyone', 'around', 'me', 'was', 'acting', 'so', 'badly', 'when', 'someone', 'won', 'a', 'humanitarian', 'award']",97
4,"['nicholas', 'brain', 'runs', 'on', 'and', 'is', 'excellent', 'at', 'processing', 'data', 'and', 'concrete', 'facts', 'he', 'works', 'best', 'with', 'things', 'he', 'can', 'quantify', 'he', 'runs', 'into', 'an', 'almost', 'insurmountable', 'challenge', 'in', 'trying', 'to', 'rationalize', 'the', 'existence', 'of', 'his', 'exceptional', 'infant', 'daughter', 'and', 'what', 'she', 'represents', 'he', 'isn', 't', 'without', 'a', 'sense', 'of', 'humor', 'and', 'is', 'very', 'loving', 'but', 'he', 'is', 'forced', 'to', 'radically', 'confront', 'his', 'limitations']",64


In [25]:
df['char_count'] = df["final"].apply(lambda x: len(x))

In [26]:
df.head()

Unnamed: 0,final,word_count,char_count
0,"['we', 'have', 'given', 'our', 'hearts', 'away', 'a', 'sordid', 'boon']",9,71
1,"['start', 'it', 'on', 'the', 'side']",5,36
2,"['sugar', 's', 'sweet', 'so', 'is', 'she']",6,42
3,"['so', 'because', 'the', 'asian', 'community', 'was', 'so', 'by', 'need', 'the', 'next', 'thing', 'you', 'know', 'i', 'have', 'asian', 'kids', 'following', 'me', 'around', 'you', 'know', 'doing', 'more', 'shit', 'for', 'me', 'than', 'the', 'boys', 'that', 'wanted', 'inside', 'of', 'me', 'especially', 'the', 'two', 'in', 'particular', 'they', 'thought', 'i', 'was', 'so', 'wonderful', 'which', 'sickened', 'me', 'even', 'further', 'i', 'was', 'not', 'wonderful', 'it', 'reminds', 'me', 'of', 'that', 'quote', 'from', 'wwii', 'era', 'forget', 'it', 'but', 'goes', 'on', 'to', 'say', 'something', 'like', 'it', 'was', 'not', 'that', 'i', 'was', 'a', 'hero', 'it', 'was', 'everyone', 'around', 'me', 'was', 'acting', 'so', 'badly', 'when', 'someone', 'won', 'a', 'humanitarian', 'award']",97,785
4,"['nicholas', 'brain', 'runs', 'on', 'and', 'is', 'excellent', 'at', 'processing', 'data', 'and', 'concrete', 'facts', 'he', 'works', 'best', 'with', 'things', 'he', 'can', 'quantify', 'he', 'runs', 'into', 'an', 'almost', 'insurmountable', 'challenge', 'in', 'trying', 'to', 'rationalize', 'the', 'existence', 'of', 'his', 'exceptional', 'infant', 'daughter', 'and', 'what', 'she', 'represents', 'he', 'isn', 't', 'without', 'a', 'sense', 'of', 'humor', 'and', 'is', 'very', 'loving', 'but', 'he', 'is', 'forced', 'to', 'radically', 'confront', 'his', 'limitations']",64,566


In [27]:
df.describe()

Unnamed: 0,word_count,char_count
count,1000.0,1000.0
mean,39.02,325.504
std,46.089715,383.393297
min,1.0,2.0
25%,8.0,64.0
50%,24.0,205.5
75%,54.25,464.75
max,655.0,5516.0


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   final       1000 non-null   object
 1   word_count  1000 non-null   int64 
 2   char_count  1000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 23.6+ KB


In [44]:
words = df.final
allwords = []
for wordlist in words:
    allwords+= wordlist

In [45]:
allwords

['[',
 "'",
 'w',
 'e',
 "'",
 ',',
 ' ',
 "'",
 'h',
 'a',
 'v',
 'e',
 "'",
 ',',
 ' ',
 "'",
 'g',
 'i',
 'v',
 'e',
 'n',
 "'",
 ',',
 ' ',
 "'",
 'o',
 'u',
 'r',
 "'",
 ',',
 ' ',
 "'",
 'h',
 'e',
 'a',
 'r',
 't',
 's',
 "'",
 ',',
 ' ',
 "'",
 'a',
 'w',
 'a',
 'y',
 "'",
 ',',
 ' ',
 "'",
 'a',
 "'",
 ',',
 ' ',
 "'",
 's',
 'o',
 'r',
 'd',
 'i',
 'd',
 "'",
 ',',
 ' ',
 "'",
 'b',
 'o',
 'o',
 'n',
 "'",
 ']',
 '[',
 "'",
 's',
 't',
 'a',
 'r',
 't',
 "'",
 ',',
 ' ',
 "'",
 'i',
 't',
 "'",
 ',',
 ' ',
 "'",
 'o',
 'n',
 "'",
 ',',
 ' ',
 "'",
 't',
 'h',
 'e',
 "'",
 ',',
 ' ',
 "'",
 's',
 'i',
 'd',
 'e',
 "'",
 ']',
 '[',
 "'",
 's',
 'u',
 'g',
 'a',
 'r',
 "'",
 ',',
 ' ',
 "'",
 's',
 "'",
 ',',
 ' ',
 "'",
 's',
 'w',
 'e',
 'e',
 't',
 "'",
 ',',
 ' ',
 "'",
 's',
 'o',
 "'",
 ',',
 ' ',
 "'",
 'i',
 's',
 "'",
 ',',
 ' ',
 "'",
 's',
 'h',
 'e',
 "'",
 ']',
 '[',
 "'",
 's',
 'o',
 "'",
 ',',
 ' ',
 "'",
 'b',
 'e',
 'c',
 'a',
 'u',
 's',
 'e',
 "'",
 ',',
 ' '

In [38]:
type(wordlist)

str

In [39]:
len(wordlist)

290

### Save as Textfile

In [None]:
#textformat = ''.join(df["words"])

In [None]:
#textformat

In [None]:
#textfile = open('textfile2.txt','w', encoding="utf-8")

In [None]:
#textfile.write(textformat)

In [None]:
#textfile.close()

***

#### Coded and submitted by Dennis Lam 2021