In [10]:
import pandas as pd
import os
import liwc

In [12]:
parse, category_names = liwc.load_token_parser('LIWC2007_English080730.dic')

In [27]:
def _parse_categories(lines):
    """
    Read (category_id, category_name) pairs from the categories section.
    Each line consists of an integer followed a tab and then the category name.
    This section is separated from the lexicon by a line consisting of a single "%".
    """
    for line in lines:
        line = line.strip()
        if line == "%":
            return
        # ignore non-matching groups of categories
        if "\t" in line:
            category_id, category_name = line.split("\t", 1)
            yield category_id, category_name

In [28]:
def _parse_lexicon(lines, category_mapping):
    """
    Read (match_expression, category_names) pairs from the lexicon section.
    Each line consists of a match expression followed by a tab and then one or more
    tab-separated integers, which are mapped to category names using `category_mapping`.
    """
    for line in lines:
        line = line.strip()
        parts = line.split("\t")
        yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]

In [29]:
def read_dic(filepath):
    """
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings)
    """
    with open(filepath) as lines:
        # read up to first "%" (should be very first line of file)
        for line in lines:
            if line.strip() == "%":
                break
        # read categories (a mapping from integer string to category name)
        category_mapping = dict(_parse_categories(lines))
        # read lexicon (a mapping from matching string to a list of category names)
        lexicon = dict(_parse_lexicon(lines, category_mapping))
    return lexicon, list(category_mapping.values())

In [39]:
dizionario,lista = read_dic('LIWC2007_English080730.dic')

In [42]:
import json
with open('liwc_dic.json', 'w') as fp:
    json.dump(dizionario, fp)

In [48]:
dizionario

{'a': ['funct', 'article'],
 'abandon*': ['affect', 'negemo', 'sad', 'cogmech', 'inhib'],
 'abdomen*': ['bio', 'body'],
 'abilit*': ['achieve'],
 'able*': ['achieve'],
 'abortion*': ['bio', 'health', 'sexual'],
 'about': ['funct', 'adverb', 'preps'],
 'above': ['funct', 'preps', 'space', 'relativ'],
 'abrupt*': ['time', 'relativ'],
 'abs': ['bio', 'body'],
 'absent*': ['work'],
 'absolute': ['cogmech', 'certain'],
 'absolutely': ['funct', 'adverb', 'cogmech', 'certain', 'assent'],
 'abstain*': ['cogmech', 'inhib'],
 'abuse*': ['affect', 'negemo', 'anger'],
 'abusi*': ['affect', 'negemo', 'anger'],
 'academ*': ['work'],
 'accept': ['affect', 'posemo', 'cogmech', 'insight'],
 'accepta*': ['affect', 'posemo', 'cogmech', 'insight'],
 'accepted': ['verb', 'past', 'affect', 'posemo', 'cogmech', 'insight'],
 'accepting': ['affect', 'posemo', 'cogmech', 'insight'],
 'accepts': ['affect', 'posemo', 'cogmech', 'insight'],
 'accomplish*': ['work', 'achieve'],
 'account*': ['money'],
 'accura*': [

In [47]:
df = pd.DataFrame(data=dizionario)


ValueError: All arrays must be of the same length